From a5cc897cdedfdca018a83fac5734ebe086acb817 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 24 Jan 2025 13:13:18 +0000 Subject: [PATCH 001/432] [gn build] Port 0ee037b861f9 --- llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index 9226658d4c767..a4a17a22c1f2c 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -181,6 +181,8 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUPromoteKernelArguments.cpp", "AMDGPURegBankCombiner.cpp", "AMDGPURegBankLegalize.cpp", + "AMDGPURegBankLegalizeHelper.cpp", + "AMDGPURegBankLegalizeRules.cpp", "AMDGPURegBankSelect.cpp", "AMDGPURegisterBankInfo.cpp", "AMDGPURemoveIncompatibleFunctions.cpp", From 6292a808b3524d9ba6f4ce55bc5b9e547b088dd8 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Fri, 24 Jan 2025 13:27:56 +0000 Subject: [PATCH 002/432] [NFC][DebugInfo] Use iterator-flavour getFirstNonPHI at many call-sites (#123737) As part of the "RemoveDIs" project, BasicBlock::iterator now carries a debug-info bit that's needed when getFirstNonPHI and similar feed into instruction insertion positions. Call-sites where that's necessary were updated a year ago; but to ensure some type safety however, we'd like to have all calls to getFirstNonPHI use the iterator-returning version. This patch changes a bunch of call-sites calling getFirstNonPHI to use getFirstNonPHIIt, which returns an iterator. All these call sites are where it's obviously safe to fetch the iterator then dereference it. A follow-up patch will contain less-obviously-safe changes. We'll eventually deprecate and remove the instruction-pointer getFirstNonPHI, but not before adding concise documentation of what considerations are needed (very few). 
--------- Co-authored-by: Stephen Tozer --- clang/lib/CodeGen/CGException.cpp | 7 ++- clang/lib/CodeGen/MicrosoftCXXABI.cpp | 2 +- llvm/include/llvm/IR/BasicBlock.h | 2 +- .../llvm/Transforms/Utils/Instrumentation.h | 5 ++ llvm/lib/Analysis/Loads.cpp | 2 +- llvm/lib/Analysis/LoopNestAnalysis.cpp | 2 +- llvm/lib/Analysis/MustExecute.cpp | 2 +- llvm/lib/Analysis/ValueTracking.cpp | 2 +- llvm/lib/CodeGen/AsmPrinter/WinException.cpp | 4 +- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 6 +- llvm/lib/CodeGen/GlobalMerge.cpp | 2 +- llvm/lib/CodeGen/MachineFunction.cpp | 3 +- llvm/lib/CodeGen/SelectOptimize.cpp | 2 +- .../SelectionDAG/FunctionLoweringInfo.cpp | 6 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 4 +- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 9 ++- llvm/lib/CodeGen/WasmEHPrepare.cpp | 10 ++-- llvm/lib/CodeGen/WinEHPrepare.cpp | 58 ++++++++++--------- llvm/lib/IR/EHPersonalities.cpp | 2 +- llvm/lib/IR/Instructions.cpp | 2 +- llvm/lib/IR/Verifier.cpp | 20 +++---- llvm/lib/Target/BPF/BPFAdjustOpt.cpp | 2 +- .../Hexagon/HexagonLoopIdiomRecognition.cpp | 3 +- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 4 +- llvm/lib/Target/X86/X86WinEHState.cpp | 6 +- .../Coroutines/MaterializationUtils.cpp | 6 +- llvm/lib/Transforms/IPO/IROutliner.cpp | 2 +- llvm/lib/Transforms/IPO/PartialInlining.cpp | 2 +- .../Instrumentation/AddressSanitizer.cpp | 6 +- .../Instrumentation/MemorySanitizer.cpp | 5 +- .../NumericalStabilitySanitizer.cpp | 2 +- .../Instrumentation/PGOInstrumentation.cpp | 6 +- .../Instrumentation/PGOMemOPSizeOpt.cpp | 2 +- .../Instrumentation/ThreadSanitizer.cpp | 6 +- llvm/lib/Transforms/ObjCARC/ObjCARC.cpp | 4 +- .../Transforms/ObjCARC/ObjCARCContract.cpp | 2 +- llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 3 +- llvm/lib/Transforms/Scalar/GVN.cpp | 3 +- llvm/lib/Transforms/Scalar/GVNSink.cpp | 2 +- llvm/lib/Transforms/Scalar/LICM.cpp | 7 ++- .../Transforms/Scalar/LoopIdiomRecognize.cpp | 12 ++-- .../lib/Transforms/Scalar/LoopInterchange.cpp | 17 +++--- .../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 2 +- llvm/lib/Transforms/Scalar/SCCP.cpp | 2 +- .../Transforms/Utils/BreakCriticalEdges.cpp | 6 +- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 6 +- llvm/lib/Transforms/Utils/InlineFunction.cpp | 47 +++++++-------- llvm/lib/Transforms/Utils/Local.cpp | 6 +- llvm/lib/Transforms/Utils/LoopSimplify.cpp | 2 +- .../Transforms/Utils/LowerMemIntrinsics.cpp | 11 ++-- llvm/lib/Transforms/Utils/MoveAutoInit.cpp | 2 +- llvm/lib/Transforms/Utils/SSAUpdater.cpp | 4 +- .../Transforms/Vectorize/LoopVectorize.cpp | 6 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 7 +-- .../llvm-reduce/deltas/ReduceBasicBlocks.cpp | 2 +- llvm/unittests/Analysis/MemorySSATest.cpp | 2 +- .../Analysis/ProfileSummaryInfoTest.cpp | 10 ++-- .../Frontend/OpenMPIRBuilderTest.cpp | 16 ++--- llvm/unittests/IR/DebugInfoTest.cpp | 10 ++-- llvm/unittests/IR/InstructionsTest.cpp | 4 +- llvm/unittests/Transforms/Scalar/LICMTest.cpp | 4 +- polly/lib/CodeGen/BlockGenerators.cpp | 8 +-- polly/lib/CodeGen/LoopGenerators.cpp | 2 +- .../lib/Transform/MaximalStaticExpansion.cpp | 4 +- 64 files changed, 217 insertions(+), 200 deletions(-) diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index 5dc1686e7914c..5a395c924333e 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -1251,11 +1251,12 @@ void CodeGenFunction::ExitCXXTryStmt(const CXXTryStmt &S, bool IsFnTryBlock) { llvm::BasicBlock *WasmCatchStartBlock = nullptr; if (EHPersonality::get(*this).isWasmPersonality()) { 
auto *CatchSwitch = - cast(DispatchBlock->getFirstNonPHI()); + cast(DispatchBlock->getFirstNonPHIIt()); WasmCatchStartBlock = CatchSwitch->hasUnwindDest() ? CatchSwitch->getSuccessor(1) : CatchSwitch->getSuccessor(0); - auto *CPI = cast(WasmCatchStartBlock->getFirstNonPHI()); + auto *CPI = + cast(WasmCatchStartBlock->getFirstNonPHIIt()); CurrentFuncletPad = CPI; } @@ -2252,7 +2253,7 @@ void CodeGenFunction::ExitSEHTryStmt(const SEHTryStmt &S) { // __except blocks don't get outlined into funclets, so immediately do a // catchret. llvm::CatchPadInst *CPI = - cast(CatchPadBB->getFirstNonPHI()); + cast(CatchPadBB->getFirstNonPHIIt()); llvm::BasicBlock *ExceptBB = createBasicBlock("__except"); Builder.CreateCatchRet(CPI, ExceptBB); EmitBlock(ExceptBB); diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index 90651c3bafe26..0d53e8cb45fe7 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -918,7 +918,7 @@ void MicrosoftCXXABI::emitBeginCatch(CodeGenFunction &CGF, VarDecl *CatchParam = S->getExceptionDecl(); llvm::BasicBlock *CatchPadBB = CGF.Builder.GetInsertBlock(); llvm::CatchPadInst *CPI = - cast(CatchPadBB->getFirstNonPHI()); + cast(CatchPadBB->getFirstNonPHIIt()); CGF.CurrentFuncletPad = CPI; // If this is a catch-all or the catch parameter is unnamed, we don't need to diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h index f85b221a211b9..e22fe1e7e7dc8 100644 --- a/llvm/include/llvm/IR/BasicBlock.h +++ b/llvm/include/llvm/IR/BasicBlock.h @@ -673,7 +673,7 @@ class BasicBlock final : public Value, // Basic blocks are data objects also void replaceSuccessorsPhiUsesWith(BasicBlock *New); /// Return true if this basic block is an exception handling block. - bool isEHPad() const { return getFirstNonPHI()->isEHPad(); } + bool isEHPad() const { return getFirstNonPHIIt()->isEHPad(); } /// Return true if this basic block is a landing pad. /// diff --git a/llvm/include/llvm/Transforms/Utils/Instrumentation.h b/llvm/include/llvm/Transforms/Utils/Instrumentation.h index 4f67d079d1469..0e2c0d9bfa605 100644 --- a/llvm/include/llvm/Transforms/Utils/Instrumentation.h +++ b/llvm/include/llvm/Transforms/Utils/Instrumentation.h @@ -204,6 +204,11 @@ struct InstrumentationIRBuilder : IRBuilder<> { explicit InstrumentationIRBuilder(Instruction *IP) : IRBuilder<>(IP) { ensureDebugInfo(*this, *IP->getFunction()); } + + explicit InstrumentationIRBuilder(BasicBlock *BB, BasicBlock::iterator It) + : IRBuilder<>(BB, It) { + ensureDebugInfo(*this, *BB->getParent()); + } }; } // end namespace llvm diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 11ccfa33821ca..9279f19b72a3f 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -284,7 +284,7 @@ bool llvm::isDereferenceableAndAlignedInLoop( DL.getTypeStoreSize(LI->getType()).getFixedValue()); const Align Alignment = LI->getAlign(); - Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI(); + Instruction *HeaderFirstNonPHI = &*L->getHeader()->getFirstNonPHIIt(); // If given a uniform (i.e. non-varying) address, see if we can prove the // access is safe within the loop w/o needing predication. 
diff --git a/llvm/lib/Analysis/LoopNestAnalysis.cpp b/llvm/lib/Analysis/LoopNestAnalysis.cpp index fe6d270b9ac53..ead5cf610d9e1 100644 --- a/llvm/lib/Analysis/LoopNestAnalysis.cpp +++ b/llvm/lib/Analysis/LoopNestAnalysis.cpp @@ -346,7 +346,7 @@ static bool checkLoopsStructure(const Loop &OuterLoop, const Loop &InnerLoop, // "guarded" inner loop which contains "only" Phi nodes corresponding to the // LCSSA Phi nodes in the exit block. auto IsExtraPhiBlock = [&](const BasicBlock &BB) { - return BB.getFirstNonPHI() == BB.getTerminator() && + return &*BB.getFirstNonPHIIt() == BB.getTerminator() && all_of(BB.phis(), [&](const PHINode &PN) { return all_of(PN.blocks(), [&](const BasicBlock *IncomingBlock) { return IncomingBlock == InnerLoopExit || diff --git a/llvm/lib/Analysis/MustExecute.cpp b/llvm/lib/Analysis/MustExecute.cpp index d5c665753075c..fde6bbf9eb181 100644 --- a/llvm/lib/Analysis/MustExecute.cpp +++ b/llvm/lib/Analysis/MustExecute.cpp @@ -275,7 +275,7 @@ bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, // exit. At the moment, we use a (cheap) hack for the common case where // the instruction of interest is the first one in the block. return !HeaderMayThrow || - Inst.getParent()->getFirstNonPHIOrDbg() == &Inst; + &*Inst.getParent()->getFirstNonPHIOrDbg() == &Inst; // If there is a path from header to exit or latch that doesn't lead to our // instruction's block, return false. diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 38f88850be0f1..264fedd6b66b9 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -8246,7 +8246,7 @@ static bool programUndefinedIfUndefOrPoison(const Value *V, if (!BB || !Visited.insert(BB).second) break; - Begin = BB->getFirstNonPHI()->getIterator(); + Begin = BB->getFirstNonPHIIt(); End = BB->end(); } return false; diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp index 6d6432b61f2d7..97b4a6a42d81d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp @@ -928,8 +928,8 @@ void WinException::computeIP2StateTable( BaseState = NullState; StartLabel = Asm->getFunctionBegin(); } else { - auto *FuncletPad = - cast(FuncletStart->getBasicBlock()->getFirstNonPHI()); + auto *FuncletPad = cast( + FuncletStart->getBasicBlock()->getFirstNonPHIIt()); assert(FuncInfo.FuncletBaseStateMap.count(FuncletPad) != 0); BaseState = FuncInfo.FuncletBaseStateMap.find(FuncletPad)->second; StartLabel = getMCSymbolForMBB(Asm, &*FuncletStart); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index f668e41094bbc..21622ea43724c 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2866,7 +2866,7 @@ bool IRTranslator::findUnwindDestinations( } while (EHPadBB) { - const Instruction *Pad = EHPadBB->getFirstNonPHI(); + BasicBlock::const_iterator Pad = EHPadBB->getFirstNonPHIIt(); BasicBlock *NewEHPadBB = nullptr; if (isa(Pad)) { // Stop on landingpads. They are not funclets. @@ -2927,7 +2927,7 @@ bool IRTranslator::translateInvoke(const User &U, return false; // FIXME: support Windows exception handling. 
- if (!isa(EHPadBB->getFirstNonPHI())) + if (!isa(EHPadBB->getFirstNonPHIIt())) return false; // FIXME: support Windows dllimport function calls and calls through @@ -4031,7 +4031,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { MF->push_back(EntryBB); EntryBuilder->setMBB(*EntryBB); - DebugLoc DbgLoc = F.getEntryBlock().getFirstNonPHI()->getDebugLoc(); + DebugLoc DbgLoc = F.getEntryBlock().getFirstNonPHIIt()->getDebugLoc(); SwiftError.setFunction(CurMF); SwiftError.createEntriesInEntryBlock(DbgLoc); diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp index 9d4547df046d4..7b76155b175d1 100644 --- a/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/llvm/lib/CodeGen/GlobalMerge.cpp @@ -633,7 +633,7 @@ void GlobalMergeImpl::setMustKeepGlobalVariables(Module &M) { for (Function &F : M) { for (BasicBlock &BB : F) { - Instruction *Pad = BB.getFirstNonPHI(); + BasicBlock::iterator Pad = BB.getFirstNonPHIIt(); auto *II = dyn_cast(Pad); if (!Pad->isEHPad() && !(II && II->getIntrinsicID() == Intrinsic::eh_typeid_for)) diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index e4824183e8dfc..ab3609b6141b8 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -833,7 +833,8 @@ MCSymbol *MachineFunction::addLandingPad(MachineBasicBlock *LandingPad) { LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); LP.LandingPadLabel = LandingPadLabel; - const Instruction *FirstI = LandingPad->getBasicBlock()->getFirstNonPHI(); + BasicBlock::const_iterator FirstI = + LandingPad->getBasicBlock()->getFirstNonPHIIt(); if (const auto *LPI = dyn_cast(FirstI)) { // If there's no typeid list specified, then "cleanup" is implicit. // Otherwise, id 0 is reserved for the cleanup action. diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp index 57488a90e7a4a..b7600a3b7fba7 100644 --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -1217,7 +1217,7 @@ bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L, return true; OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", - L->getHeader()->getFirstNonPHI()); + &*L->getHeader()->getFirstNonPHIIt()); if (LoopCost[0].NonPredCost > LoopCost[0].PredCost || LoopCost[1].NonPredCost >= LoopCost[1].PredCost) { diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 3e89b18585f15..33c6341744478 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -250,7 +250,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, // Don't create MachineBasicBlocks for imaginary EH pad blocks. These blocks // are really data, and no instructions can live here. if (BB.isEHPad()) { - const Instruction *PadInst = BB.getFirstNonPHI(); + BasicBlock::const_iterator PadInst = BB.getFirstNonPHIIt(); // If this is a non-landingpad EH pad, mark this function as using // funclets. 
// FIXME: SEH catchpads do not create EH scope/funclets, so we could avoid @@ -261,13 +261,13 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, MF->getFrameInfo().setHasOpaqueSPAdjustment(true); } if (isa(PadInst)) { - assert(&*BB.begin() == PadInst && + assert(BB.begin() == PadInst && "WinEHPrepare failed to remove PHIs from imaginary BBs"); continue; } if (isa(PadInst) && Personality != EHPersonality::Wasm_CXX) - assert(&*BB.begin() == PadInst && "WinEHPrepare failed to demote PHIs"); + assert(BB.begin() == PadInst && "WinEHPrepare failed to demote PHIs"); } MachineBasicBlock *MBB = mf.CreateMachineBasicBlock(&BB); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ecaa61fdc86a4..428e7a316d247 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2063,7 +2063,7 @@ static void findWasmUnwindDestinations( SmallVectorImpl> &UnwindDests) { while (EHPadBB) { - const Instruction *Pad = EHPadBB->getFirstNonPHI(); + BasicBlock::const_iterator Pad = EHPadBB->getFirstNonPHIIt(); if (isa(Pad)) { // Stop on cleanup pads. UnwindDests.emplace_back(FuncInfo.getMBB(EHPadBB), Prob); @@ -2111,7 +2111,7 @@ static void findUnwindDestinations( } while (EHPadBB) { - const Instruction *Pad = EHPadBB->getFirstNonPHI(); + BasicBlock::const_iterator Pad = EHPadBB->getFirstNonPHIIt(); BasicBlock *NewEHPadBB = nullptr; if (isa(Pad)) { // Stop on landingpads. They are not funclets. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 3b1abf7f3d994..899f83bbc6064 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1421,7 +1421,7 @@ bool SelectionDAGISel::PrepareEHLandingPad() { // Catchpads have one live-in register, which typically holds the exception // pointer or code. if (isFuncletEHPersonality(Pers)) { - if (const auto *CPI = dyn_cast(LLVMBB->getFirstNonPHI())) { + if (const auto *CPI = dyn_cast(LLVMBB->getFirstNonPHIIt())) { if (hasExceptionPointerOrCodeUser(CPI)) { // Get or create the virtual register to hold the pointer or code. Mark // the live in physreg and copy into the vreg. @@ -1452,7 +1452,7 @@ bool SelectionDAGISel::PrepareEHLandingPad() { MF->getRegInfo().addPhysRegsUsedFromRegMask(RegMask); if (Pers == EHPersonality::Wasm_CXX) { - if (const auto *CPI = dyn_cast(LLVMBB->getFirstNonPHI())) + if (const auto *CPI = dyn_cast(LLVMBB->getFirstNonPHIIt())) mapWasmLandingPadIndex(MBB, CPI); } else { // Assign the call site to the landing pad's begin label. @@ -1721,13 +1721,12 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // use anything def'd by or after the tail call. 
{ BasicBlock::iterator BBStart = - const_cast(LLVMBB)->getFirstNonPHI()->getIterator(); + const_cast(LLVMBB)->getFirstNonPHIIt(); BasicBlock::iterator BBEnd = const_cast(LLVMBB)->end(); preserveFakeUses(BBStart, BBEnd); } - BasicBlock::const_iterator const Begin = - LLVMBB->getFirstNonPHI()->getIterator(); + BasicBlock::const_iterator const Begin = LLVMBB->getFirstNonPHIIt(); BasicBlock::const_iterator const End = LLVMBB->end(); BasicBlock::const_iterator BI = End; diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp index 1701b0d04425d..d18196b2217f5 100644 --- a/llvm/lib/CodeGen/WasmEHPrepare.cpp +++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp @@ -227,7 +227,7 @@ bool WasmEHPrepareImpl::prepareEHPads(Function &F) { for (BasicBlock &BB : F) { if (!BB.isEHPad()) continue; - auto *Pad = BB.getFirstNonPHI(); + BasicBlock::iterator Pad = BB.getFirstNonPHIIt(); if (isa(Pad)) CatchPads.push_back(&BB); else if (isa(Pad)) @@ -284,7 +284,7 @@ bool WasmEHPrepareImpl::prepareEHPads(Function &F) { unsigned Index = 0; for (auto *BB : CatchPads) { - auto *CPI = cast(BB->getFirstNonPHI()); + auto *CPI = cast(BB->getFirstNonPHIIt()); // In case of a single catch (...), we don't need to emit a personalify // function call if (CPI->arg_size() == 1 && @@ -309,7 +309,7 @@ void WasmEHPrepareImpl::prepareEHPad(BasicBlock *BB, bool NeedPersonality, IRBuilder<> IRB(BB->getContext()); IRB.SetInsertPoint(BB, BB->getFirstInsertionPt()); - auto *FPI = cast(BB->getFirstNonPHI()); + auto *FPI = cast(BB->getFirstNonPHIIt()); Instruction *GetExnCI = nullptr, *GetSelectorCI = nullptr; for (auto &U : FPI->uses()) { if (auto *CI = dyn_cast(U.getUser())) { @@ -388,13 +388,13 @@ void llvm::calculateWasmEHInfo(const Function *F, WasmEHFuncInfo &EHInfo) { for (const auto &BB : *F) { if (!BB.isEHPad()) continue; - const Instruction *Pad = BB.getFirstNonPHI(); + const Instruction *Pad = &*BB.getFirstNonPHIIt(); if (const auto *CatchPad = dyn_cast(Pad)) { const auto *UnwindBB = CatchPad->getCatchSwitch()->getUnwindDest(); if (!UnwindBB) continue; - const Instruction *UnwindPad = UnwindBB->getFirstNonPHI(); + const Instruction *UnwindPad = &*UnwindBB->getFirstNonPHIIt(); if (const auto *CatchSwitch = dyn_cast(UnwindPad)) // Currently there should be only one handler per a catchswitch. 
EHInfo.setUnwindDest(&BB, *CatchSwitch->handlers().begin()); diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp index c58c67b70fe3c..6d85f07829033 100644 --- a/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -201,7 +201,7 @@ static void calculateStateNumbersForInvokes(const Function *Fn, BasicBlock *FuncletUnwindDest; auto *FuncletPad = - dyn_cast(FuncletEntryBB->getFirstNonPHI()); + dyn_cast(FuncletEntryBB->getFirstNonPHIIt()); assert(FuncletPad || FuncletEntryBB == &Fn->getEntryBlock()); if (!FuncletPad) FuncletUnwindDest = nullptr; @@ -223,7 +223,7 @@ static void calculateStateNumbersForInvokes(const Function *Fn, if (BaseState != -1) { FuncInfo.InvokeStateMap[II] = BaseState; } else { - Instruction *PadInst = InvokeUnwindDest->getFirstNonPHI(); + Instruction *PadInst = &*InvokeUnwindDest->getFirstNonPHIIt(); assert(FuncInfo.EHPadStateMap.count(PadInst) && "EH Pad has no state!"); FuncInfo.InvokeStateMap[II] = FuncInfo.EHPadStateMap[PadInst]; } @@ -254,10 +254,10 @@ void llvm::calculateCXXStateForAsynchEH(const BasicBlock *BB, int State, if (EHInfo.BlockToStateMap.count(BB) && EHInfo.BlockToStateMap[BB] <= State) continue; // skip blocks already visited by lower State - const llvm::Instruction *I = BB->getFirstNonPHI(); + BasicBlock::const_iterator It = BB->getFirstNonPHIIt(); const llvm::Instruction *TI = BB->getTerminator(); - if (I->isEHPad()) - State = EHInfo.EHPadStateMap[I]; + if (It->isEHPad()) + State = EHInfo.EHPadStateMap[&*It]; EHInfo.BlockToStateMap[BB] = State; // Record state, also flag visiting if ((isa(TI) || isa(TI)) && State > 0) { @@ -315,15 +315,15 @@ void llvm::calculateSEHStateForAsynchEH(const BasicBlock *BB, int State, if (EHInfo.BlockToStateMap.count(BB) && EHInfo.BlockToStateMap[BB] <= State) continue; // skip blocks already visited by lower State - const llvm::Instruction *I = BB->getFirstNonPHI(); + BasicBlock::const_iterator It = BB->getFirstNonPHIIt(); const llvm::Instruction *TI = BB->getTerminator(); - if (I->isEHPad()) - State = EHInfo.EHPadStateMap[I]; + if (It->isEHPad()) + State = EHInfo.EHPadStateMap[&*It]; EHInfo.BlockToStateMap[BB] = State; // Record state - if (isa(I) && isa(TI)) { + if (isa(It) && isa(TI)) { const Constant *FilterOrNull = cast( - cast(I)->getArgOperand(0)->stripPointerCasts()); + cast(It)->getArgOperand(0)->stripPointerCasts()); const Function *Filter = dyn_cast(FilterOrNull); if (!Filter || !Filter->getName().starts_with("__IsLocalUnwind")) State = EHInfo.SEHUnwindMap[State].ToState; // Retrive next State @@ -385,7 +385,7 @@ static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo, SmallVector Handlers; for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) { - auto *CatchPad = cast(CatchPadBB->getFirstNonPHI()); + auto *CatchPad = cast(CatchPadBB->getFirstNonPHIIt()); Handlers.push_back(CatchPad); } int TryLow = addUnwindMapEntry(FuncInfo, ParentState, nullptr); @@ -393,7 +393,7 @@ static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo, for (const BasicBlock *PredBlock : predecessors(BB)) if ((PredBlock = getEHPadFromPredecessor(PredBlock, CatchSwitch->getParentPad()))) - calculateCXXStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + calculateCXXStateNumbers(FuncInfo, &*PredBlock->getFirstNonPHIIt(), TryLow); int CatchLow = addUnwindMapEntry(FuncInfo, ParentState, nullptr); @@ -456,7 +456,7 @@ static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo, for (const BasicBlock *PredBlock : predecessors(BB)) { if ((PredBlock = 
getEHPadFromPredecessor(PredBlock, CleanupPad->getParentPad()))) { - calculateCXXStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + calculateCXXStateNumbers(FuncInfo, &*PredBlock->getFirstNonPHIIt(), CleanupState); } } @@ -509,7 +509,7 @@ static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo, assert(CatchSwitch->getNumHandlers() == 1 && "SEH doesn't have multiple handlers per __try"); const auto *CatchPad = - cast((*CatchSwitch->handler_begin())->getFirstNonPHI()); + cast((*CatchSwitch->handler_begin())->getFirstNonPHIIt()); const BasicBlock *CatchPadBB = CatchPad->getParent(); const Constant *FilterOrNull = cast(CatchPad->getArgOperand(0)->stripPointerCasts()); @@ -526,7 +526,7 @@ static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo, for (const BasicBlock *PredBlock : predecessors(BB)) if ((PredBlock = getEHPadFromPredecessor(PredBlock, CatchSwitch->getParentPad()))) - calculateSEHStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + calculateSEHStateNumbers(FuncInfo, &*PredBlock->getFirstNonPHIIt(), TryState); // Everything in the __except block unwinds to ParentState, just like code @@ -562,7 +562,7 @@ static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo, for (const BasicBlock *PredBlock : predecessors(BB)) if ((PredBlock = getEHPadFromPredecessor(PredBlock, CleanupPad->getParentPad()))) - calculateSEHStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + calculateSEHStateNumbers(FuncInfo, &*PredBlock->getFirstNonPHIIt(), CleanupState); for (const User *U : CleanupPad->users()) { const auto *UserI = cast(U); @@ -594,7 +594,7 @@ void llvm::calculateSEHStateNumbers(const Function *Fn, for (const BasicBlock &BB : *Fn) { if (!BB.isEHPad()) continue; - const Instruction *FirstNonPHI = BB.getFirstNonPHI(); + const Instruction *FirstNonPHI = &*BB.getFirstNonPHIIt(); if (!isTopLevelPadForMSVC(FirstNonPHI)) continue; ::calculateSEHStateNumbers(FuncInfo, FirstNonPHI, -1); @@ -618,7 +618,7 @@ void llvm::calculateWinCXXEHStateNumbers(const Function *Fn, for (const BasicBlock &BB : *Fn) { if (!BB.isEHPad()) continue; - const Instruction *FirstNonPHI = BB.getFirstNonPHI(); + const Instruction *FirstNonPHI = &*BB.getFirstNonPHIIt(); if (!isTopLevelPadForMSVC(FirstNonPHI)) continue; calculateCXXStateNumbers(FuncInfo, FirstNonPHI, -1); @@ -678,7 +678,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, // Seed a worklist with pads that have no parent. SmallVector, 8> Worklist; for (const BasicBlock &BB : *Fn) { - const Instruction *FirstNonPHI = BB.getFirstNonPHI(); + const Instruction *FirstNonPHI = &*BB.getFirstNonPHIIt(); const Value *ParentPad; if (const auto *CPI = dyn_cast(FirstNonPHI)) ParentPad = CPI->getParentPad(); @@ -725,7 +725,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, for (const BasicBlock *CatchBlock : llvm::reverse(CatchBlocks)) { // Create the entry for this catch with the appropriate handler // properties. - const auto *Catch = cast(CatchBlock->getFirstNonPHI()); + const auto *Catch = cast(CatchBlock->getFirstNonPHIIt()); uint32_t TypeToken = static_cast( cast(Catch->getArgOperand(0))->getZExtValue()); CatchState = @@ -751,7 +751,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, // so visit pads in descendant-most to ancestor-most order. 
for (ClrEHUnwindMapEntry &Entry : llvm::reverse(FuncInfo.ClrEHUnwindMap)) { const Instruction *Pad = - cast(Entry.Handler)->getFirstNonPHI(); + &*cast(Entry.Handler)->getFirstNonPHIIt(); // For most pads, the TryParentState is the state associated with the // unwind dest of exceptional exits from it. const BasicBlock *UnwindDest; @@ -800,7 +800,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, // Now we have an unwind dest for the user, but we need to see if it // unwinds all the way out of the cleanup or if it stays within it. - const Instruction *UserUnwindPad = UserUnwindDest->getFirstNonPHI(); + const Instruction *UserUnwindPad = &*UserUnwindDest->getFirstNonPHIIt(); const Value *UserUnwindParent; if (auto *CSI = dyn_cast(UserUnwindPad)) UserUnwindParent = CSI->getParentPad(); @@ -835,7 +835,8 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, if (!UnwindDest) { UnwindDestState = -1; } else { - UnwindDestState = FuncInfo.EHPadStateMap[UnwindDest->getFirstNonPHI()]; + UnwindDestState = + FuncInfo.EHPadStateMap[&*UnwindDest->getFirstNonPHIIt()]; } Entry.TryParentState = UnwindDestState; @@ -863,7 +864,8 @@ void WinEHPrepareImpl::demotePHIsOnFunclets(Function &F, for (BasicBlock &BB : make_early_inc_range(F)) { if (!BB.isEHPad()) continue; - if (DemoteCatchSwitchPHIOnly && !isa(BB.getFirstNonPHI())) + if (DemoteCatchSwitchPHIOnly && + !isa(BB.getFirstNonPHIIt())) continue; for (Instruction &I : make_early_inc_range(BB)) { @@ -898,7 +900,7 @@ void WinEHPrepareImpl::cloneCommonBlocks(Function &F) { if (FuncletPadBB == &F.getEntryBlock()) FuncletToken = ConstantTokenNone::get(F.getContext()); else - FuncletToken = FuncletPadBB->getFirstNonPHI(); + FuncletToken = &*FuncletPadBB->getFirstNonPHIIt(); std::vector> Orig2Clone; ValueToValueMapTy VMap; @@ -1094,7 +1096,7 @@ void WinEHPrepareImpl::removeImplausibleInstructions(Function &F) { for (auto &Funclet : FuncletBlocks) { BasicBlock *FuncletPadBB = Funclet.first; std::vector &BlocksInFunclet = Funclet.second; - Instruction *FirstNonPHI = FuncletPadBB->getFirstNonPHI(); + Instruction *FirstNonPHI = &*FuncletPadBB->getFirstNonPHIIt(); auto *FuncletPad = dyn_cast(FirstNonPHI); auto *CatchPad = dyn_cast_or_null(FuncletPad); auto *CleanupPad = dyn_cast_or_null(FuncletPad); @@ -1228,7 +1230,7 @@ bool WinEHPrepareImpl::prepareExplicitEH(Function &F) { AllocaInst *WinEHPrepareImpl::insertPHILoads(PHINode *PN, Function &F) { BasicBlock *PHIBlock = PN->getParent(); AllocaInst *SpillSlot = nullptr; - Instruction *EHPad = PHIBlock->getFirstNonPHI(); + Instruction *EHPad = &*PHIBlock->getFirstNonPHIIt(); if (!EHPad->isTerminator()) { // If the EHPad isn't a terminator, then we can insert a load in this block @@ -1303,7 +1305,7 @@ void WinEHPrepareImpl::insertPHIStore( BasicBlock *PredBlock, Value *PredVal, AllocaInst *SpillSlot, SmallVectorImpl> &Worklist) { - if (PredBlock->isEHPad() && PredBlock->getFirstNonPHI()->isTerminator()) { + if (PredBlock->isEHPad() && PredBlock->getFirstNonPHIIt()->isTerminator()) { // Pred is unsplittable, so we need to queue it on the worklist. 
Worklist.push_back({PredBlock, PredVal}); return; diff --git a/llvm/lib/IR/EHPersonalities.cpp b/llvm/lib/IR/EHPersonalities.cpp index 7c32601b8a83e..575130bff7a34 100644 --- a/llvm/lib/IR/EHPersonalities.cpp +++ b/llvm/lib/IR/EHPersonalities.cpp @@ -129,7 +129,7 @@ DenseMap llvm::colorEHFunclets(Function &F) { DEBUG_WITH_TYPE("win-eh-prepare-coloring", dbgs() << "Visiting " << Visiting->getName() << ", " << Color->getName() << "\n"); - Instruction *VisitingHead = Visiting->getFirstNonPHI(); + BasicBlock::iterator VisitingHead = Visiting->getFirstNonPHIIt(); if (VisitingHead->isEHPad()) { // Mark this funclet head as a member of itself. Color = Visiting; diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index b585d8cfbf2e2..c9f5807765e40 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -832,7 +832,7 @@ InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef OpB, } LandingPadInst *InvokeInst::getLandingPadInst() const { - return cast(getUnwindDest()->getFirstNonPHI()); + return cast(getUnwindDest()->getFirstNonPHIIt()); } void InvokeInst::updateProfWeight(uint64_t S, uint64_t T) { diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 00280dbe5300b..54de812517438 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2726,7 +2726,7 @@ static Instruction *getSuccPad(Instruction *Terminator) { UnwindDest = CSI->getUnwindDest(); else UnwindDest = cast(Terminator)->getUnwindDest(); - return UnwindDest->getFirstNonPHI(); + return &*UnwindDest->getFirstNonPHIIt(); } void Verifier::verifySiblingFuncletUnwinds() { @@ -4585,7 +4585,7 @@ void Verifier::visitCatchPadInst(CatchPadInst &CPI) { // The catchpad instruction must be the first non-PHI instruction in the // block. - Check(BB->getFirstNonPHI() == &CPI, + Check(&*BB->getFirstNonPHIIt() == &CPI, "CatchPadInst not the first non-PHI instruction in the block.", &CPI); visitEHPadPredecessors(CPI); @@ -4609,7 +4609,7 @@ void Verifier::visitCleanupPadInst(CleanupPadInst &CPI) { // The cleanuppad instruction must be the first non-PHI instruction in the // block. - Check(BB->getFirstNonPHI() == &CPI, + Check(&*BB->getFirstNonPHIIt() == &CPI, "CleanupPadInst not the first non-PHI instruction in the block.", &CPI); auto *ParentPad = CPI.getParentPad(); @@ -4664,7 +4664,7 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { Value *UnwindPad; bool ExitsFPI; if (UnwindDest) { - UnwindPad = UnwindDest->getFirstNonPHI(); + UnwindPad = &*UnwindDest->getFirstNonPHIIt(); if (!cast(UnwindPad)->isEHPad()) continue; Value *UnwindParent = getParentPad(UnwindPad); @@ -4767,7 +4767,7 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { BasicBlock *SwitchUnwindDest = CatchSwitch->getUnwindDest(); Value *SwitchUnwindPad; if (SwitchUnwindDest) - SwitchUnwindPad = SwitchUnwindDest->getFirstNonPHI(); + SwitchUnwindPad = &*SwitchUnwindDest->getFirstNonPHIIt(); else SwitchUnwindPad = ConstantTokenNone::get(FPI.getContext()); Check(SwitchUnwindPad == FirstUnwindPad, @@ -4790,7 +4790,7 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) { // The catchswitch instruction must be the first non-PHI instruction in the // block. 
- Check(BB->getFirstNonPHI() == &CatchSwitch, + Check(&*BB->getFirstNonPHIIt() == &CatchSwitch, "CatchSwitchInst not the first non-PHI instruction in the block.", &CatchSwitch); @@ -4799,14 +4799,14 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) { "CatchSwitchInst has an invalid parent.", ParentPad); if (BasicBlock *UnwindDest = CatchSwitch.getUnwindDest()) { - Instruction *I = UnwindDest->getFirstNonPHI(); + BasicBlock::iterator I = UnwindDest->getFirstNonPHIIt(); Check(I->isEHPad() && !isa(I), "CatchSwitchInst must unwind to an EH block which is not a " "landingpad.", &CatchSwitch); // Record catchswitch sibling unwinds for verifySiblingFuncletUnwinds - if (getParentPad(I) == ParentPad) + if (getParentPad(&*I) == ParentPad) SiblingFuncletInfo[&CatchSwitch] = &CatchSwitch; } @@ -4814,7 +4814,7 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) { "CatchSwitchInst cannot have empty handler list", &CatchSwitch); for (BasicBlock *Handler : CatchSwitch.handlers()) { - Check(isa(Handler->getFirstNonPHI()), + Check(isa(Handler->getFirstNonPHIIt()), "CatchSwitchInst handlers must be catchpads", &CatchSwitch, Handler); } @@ -4828,7 +4828,7 @@ void Verifier::visitCleanupReturnInst(CleanupReturnInst &CRI) { CRI.getOperand(0)); if (BasicBlock *UnwindDest = CRI.getUnwindDest()) { - Instruction *I = UnwindDest->getFirstNonPHI(); + BasicBlock::iterator I = UnwindDest->getFirstNonPHIIt(); Check(I->isEHPad() && !isa(I), "CleanupReturnInst must unwind to an EH block which is not a " "landingpad.", diff --git a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp index 1b2558d2e4009..afc47968bf657 100644 --- a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp +++ b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp @@ -222,7 +222,7 @@ bool BPFAdjustOptImpl::serializeICMPCrossBB(BasicBlock &BB) { if (!BI || !BI->isConditional()) return false; auto *Cond = dyn_cast(BI->getCondition()); - if (!Cond || B2->getFirstNonPHI() != Cond) + if (!Cond || &*B2->getFirstNonPHIIt() != Cond) return false; Value *B2Op0 = Cond->getOperand(0); auto Cond2Op = Cond->getPredicate(); diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 991ee5b1cbaa5..d2ae2ef7bd7ff 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1097,8 +1097,7 @@ bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB, promoteTo(In, DestTy, LoopB); // Fix up the PHI nodes in the exit block. - Instruction *EndI = ExitB->getFirstNonPHI(); - BasicBlock::iterator End = EndI ? 
EndI->getIterator() : ExitB->end(); + BasicBlock::iterator End = ExitB->getFirstNonPHIIt(); for (auto I = ExitB->begin(); I != End; ++I) { PHINode *P = dyn_cast(I); if (!P) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 839a206033a0c..c60cf69c30104 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -1199,7 +1199,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) { // Look for orphan landingpads, can occur in blocks with no predecessors for (BasicBlock &BB : F) { - Instruction *I = BB.getFirstNonPHI(); + BasicBlock::iterator I = BB.getFirstNonPHIIt(); if (auto *LPI = dyn_cast(I)) LandingPads.insert(LPI); } @@ -1739,7 +1739,7 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj( SmallVector ToErase; for (auto &BB : F) { - if (auto *CSI = dyn_cast(BB.getFirstNonPHI())) { + if (auto *CSI = dyn_cast(BB.getFirstNonPHIIt())) { if (CSI != CatchSwitchLongjmp && CSI->unwindsToCaller()) { IRB.SetInsertPoint(CSI); ToErase.push_back(CSI); diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp index 35b7d7f508b02..7d6d3f8d21f25 100644 --- a/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/llvm/lib/Target/X86/X86WinEHState.cpp @@ -513,7 +513,7 @@ int WinEHStatePass::getBaseStateForBB( assert(BBColors.size() == 1 && "multi-color BB not removed by preparation"); BasicBlock *FuncletEntryBB = BBColors.front(); if (auto *FuncletPad = - dyn_cast(FuncletEntryBB->getFirstNonPHI())) { + dyn_cast(FuncletEntryBB->getFirstNonPHIIt())) { auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad); if (BaseStateI != FuncInfo.FuncletBaseStateMap.end()) BaseState = BaseStateI->second; @@ -741,7 +741,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { for (BasicBlock *BB : RPOT) { auto &BBColors = BlockColors[BB]; BasicBlock *FuncletEntryBB = BBColors.front(); - if (isa(FuncletEntryBB->getFirstNonPHI())) + if (isa(FuncletEntryBB->getFirstNonPHIIt())) continue; int PrevState = getPredState(FinalStates, F, ParentBaseState, BB); @@ -783,7 +783,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { for (CallBase *Call : SetJmp3Calls) { auto &BBColors = BlockColors[Call->getParent()]; BasicBlock *FuncletEntryBB = BBColors.front(); - bool InCleanup = isa(FuncletEntryBB->getFirstNonPHI()); + bool InCleanup = isa(FuncletEntryBB->getFirstNonPHIIt()); IRBuilder<> Builder(Call); Value *State; diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp index cc462011a6242..3686c7c153999 100644 --- a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp @@ -180,12 +180,12 @@ static void rewriteMaterializableInstructions( // insert the remats into the end of the predecessor (there should only be // one). This is so that suspend blocks always have the suspend instruction // as the first instruction. 
- auto InsertPoint = &*Use->getParent()->getFirstInsertionPt(); + BasicBlock::iterator InsertPoint = Use->getParent()->getFirstInsertionPt(); if (isa(Use)) { BasicBlock *SuspendPredecessorBlock = Use->getParent()->getSinglePredecessor(); assert(SuspendPredecessorBlock && "malformed coro suspend instruction"); - InsertPoint = SuspendPredecessorBlock->getTerminator(); + InsertPoint = SuspendPredecessorBlock->getTerminator()->getIterator(); } // Note: skip the first instruction as this is the actual use that we're @@ -197,7 +197,7 @@ static void rewriteMaterializableInstructions( CurrentMaterialization = D->clone(); CurrentMaterialization->setName(D->getName()); CurrentMaterialization->insertBefore(InsertPoint); - InsertPoint = CurrentMaterialization; + InsertPoint = CurrentMaterialization->getIterator(); // Replace all uses of Def in the instructions being added as part of this // rematerialization group diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index 6cc218e63a012..3f54106bd09fe 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -1754,7 +1754,7 @@ findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region, // If we've made it here, it means we weren't able to replace the PHINode, so // we must insert it ourselves. PHINode *NewPN = cast(PN.clone()); - NewPN->insertBefore(&*OverallPhiBlock->begin()); + NewPN->insertBefore(OverallPhiBlock->begin()); for (unsigned Idx = 0, Edx = NewPN->getNumIncomingValues(); Idx < Edx; Idx++) { Value *IncomingVal = NewPN->getIncomingValue(Idx); diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index 28c81465a0948..cead7b84c3fc8 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -1039,7 +1039,7 @@ void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const { }; ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( - ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); + ClonedOI->ReturnBlock->getFirstNonPHIIt()); BasicBlock::iterator I = PreReturn->begin(); BasicBlock::iterator Ins = ClonedOI->ReturnBlock->begin(); SmallVector DeadPhis; diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index cca6f78084b46..e5f3b7f24bca7 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -689,11 +689,11 @@ class RuntimeCallInserter { } BasicBlock *Color = Colors.front(); - Instruction *EHPad = Color->getFirstNonPHI(); + BasicBlock::iterator EHPadIt = Color->getFirstNonPHIIt(); - if (EHPad && EHPad->isEHPad()) { + if (EHPadIt != Color->end() && EHPadIt->isEHPad()) { // Replace CI with a clone with an added funclet OperandBundle - OperandBundleDef OB("funclet", EHPad); + OperandBundleDef OB("funclet", &*EHPadIt); auto *NewCall = CallBase::addOperandBundle(CI, LLVMContext::OB_funclet, OB, CI->getIterator()); NewCall->copyMetadata(*CI); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 8e9b85c8d6857..56d3eb10d73e9 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1205,8 +1205,9 @@ struct MemorySanitizerVisitor : public InstVisitor { removeUnreachableBlocks(F); MS.initializeCallbacks(*F.getParent(), TLI); - FnPrologueEnd = 
IRBuilder<>(F.getEntryBlock().getFirstNonPHI()) - .CreateIntrinsic(Intrinsic::donothing, {}, {}); + FnPrologueEnd = + IRBuilder<>(&F.getEntryBlock(), F.getEntryBlock().getFirstNonPHIIt()) + .CreateIntrinsic(Intrinsic::donothing, {}, {}); if (MS.CompileKernel) { IRBuilder<> IRB(FnPrologueEnd); diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index 57e39c4eae966..d396dbf75eebc 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -760,7 +760,7 @@ void NumericalStabilitySanitizer::createShadowArguments( })) return; - IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHI()); + IRBuilder<> Builder(&F.getEntryBlock(), F.getEntryBlock().getFirstNonPHIIt()); // The function has shadow args if the shadow args tag matches the function // address. Value *HasShadowArgs = Builder.CreateICmpEQ( diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index db4d62ec36751..5ad07e83d1273 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -910,9 +910,9 @@ populateEHOperandBundle(VPCandidateInfo &Cand, if (!BlockColors.empty()) { const ColorVector &CV = BlockColors.find(OrigCall->getParent())->second; assert(CV.size() == 1 && "non-unique color for block!"); - Instruction *EHPad = CV.front()->getFirstNonPHI(); - if (EHPad->isEHPad()) - OpBundles.emplace_back("funclet", EHPad); + BasicBlock::iterator EHPadIt = CV.front()->getFirstNonPHIIt(); + if (EHPadIt->isEHPad()) + OpBundles.emplace_back("funclet", &*EHPadIt); } } } diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index dc51c564fbe0d..f6780c0f06b18 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -386,7 +386,7 @@ bool MemOPSizeOpt::perform(MemOp MO) { PHINode *PHI = nullptr; if (!MemOpTy->isVoidTy()) { // Insert a phi for the return values at the merge block. - IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); + IRBuilder<> IRBM(MergeBB, MergeBB->getFirstNonPHIIt()); PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); MO.I->replaceAllUsesWith(PHI); PHI->addIncoming(MO.I, DefaultBB); diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index e0070e583b681..7deaac5e59a28 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -479,7 +479,8 @@ static bool isTsanAtomic(const Instruction *I) { } void ThreadSanitizer::InsertRuntimeIgnores(Function &F) { - InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); + InstrumentationIRBuilder IRB(&F.getEntryBlock(), + F.getEntryBlock().getFirstNonPHIIt()); IRB.CreateCall(TsanIgnoreBegin); EscapeEnumerator EE(F, "tsan_ignore_cleanup", ClHandleCxxExceptions); while (IRBuilder<> *AtExit = EE.Next()) { @@ -569,7 +570,8 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, // Instrument function entry/exit points if there were instrumented accesses. 
if ((Res || HasCalls) && ClInstrumentFuncEntryExit) { - InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); + InstrumentationIRBuilder IRB(&F.getEntryBlock(), + F.getEntryBlock().getFirstNonPHIIt()); Value *ReturnAddress = IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0)); IRB.CreateCall(TsanFuncEntry, ReturnAddress); diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp index 33870d7ea192a..b6ade1c29a2b5 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp @@ -32,9 +32,9 @@ CallInst *objcarc::createCallInstWithColors( if (!BlockColors.empty()) { const ColorVector &CV = BlockColors.find(InsertBefore->getParent())->second; assert(CV.size() == 1 && "non-unique color for block!"); - Instruction *EHPad = CV.front()->getFirstNonPHI(); + BasicBlock::iterator EHPad = CV.front()->getFirstNonPHIIt(); if (EHPad->isEHPad()) - OpBundles.emplace_back("funclet", EHPad); + OpBundles.emplace_back("funclet", &*EHPad); } return CallInst::Create(FTy, Callee, Args, OpBundles, NameStr, InsertBefore); diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index b020591c203db..8407726a69c0b 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -627,7 +627,7 @@ bool ObjCARCContract::run(Function &F, AAResults *A, DominatorTree *D) { // block with a catchswitch has no insertion point. Keep going up // the dominator tree until we find a non-catchswitch. BasicBlock *InsertBB = IncomingBB; - while (isa(InsertBB->getFirstNonPHI())) { + while (isa(InsertBB->getFirstNonPHIIt())) { InsertBB = DT->getNode(InsertBB)->getIDom()->getBlock(); } diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 340d55190a5e6..9d7f5e64f9868 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -583,7 +583,8 @@ class ObjCARCOpt { const ColorVector &CV = BlockEHColors.find(BB)->second; assert(CV.size() > 0 && "Uncolored block"); for (BasicBlock *EHPadBB : CV) - if (auto *EHPad = dyn_cast(EHPadBB->getFirstNonPHI())) { + if (auto *EHPad = + dyn_cast(EHPadBB->getFirstNonPHIIt())) { OpBundles.emplace_back("funclet", EHPad); return; } diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index def4add46e5ba..21eb7f741d7c8 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -1720,7 +1720,8 @@ bool GVNPass::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, // to speculatively execute the load at that points. if (MustEnsureSafetyOfSpeculativeExecution) { if (CriticalEdgePredSplit.size()) - if (!isSafeToSpeculativelyExecute(Load, LoadBB->getFirstNonPHI(), AC, DT)) + if (!isSafeToSpeculativelyExecute(Load, &*LoadBB->getFirstNonPHIIt(), AC, + DT)) return false; for (auto &PL : PredLoads) if (!isSafeToSpeculativelyExecute(Load, PL.first->getTerminator(), AC, diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index 730f5cd0f8d0d..6651281ff2d01 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -906,7 +906,7 @@ void GVNSink::sinkLastInstruction(ArrayRef Blocks, // and move it to the start of the successor block. 
for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) I0->getOperandUse(O).set(NewOperands[O]); - I0->moveBefore(&*BBEnd->getFirstInsertionPt()); + I0->moveBefore(BBEnd->getFirstInsertionPt()); // Update metadata and IR flags. for (auto *I : Insts) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 658187ed74505..1a65154ae5936 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -1448,9 +1448,9 @@ static Instruction *cloneInstructionInExitBlock( const ColorVector &CV = BlockColors.find(&ExitBlock)->second; assert(CV.size() == 1 && "non-unique color for exit block!"); BasicBlock *BBColor = CV.front(); - Instruction *EHPad = BBColor->getFirstNonPHI(); + BasicBlock::iterator EHPad = BBColor->getFirstNonPHIIt(); if (EHPad->isEHPad()) - OpBundles.emplace_back("funclet", EHPad); + OpBundles.emplace_back("funclet", &*EHPad); } New = CallInst::Create(CI, OpBundles); @@ -1549,7 +1549,8 @@ static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) { // it require updating BlockColors for all offspring blocks accordingly. By // skipping such corner case, we can make updating BlockColors after splitting // predecessor fairly simple. - if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad()) + if (!SafetyInfo->getBlockColors().empty() && + BB->getFirstNonPHIIt()->isEHPad()) return false; for (BasicBlock *BBPred : predecessors(BB)) { if (isa(BBPred->getTerminator())) diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 3c82eeda54838..c5091e731444e 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1641,8 +1641,8 @@ static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL, // plus "cnt0". Currently it is not optimized. // This step could be used to detect POPCNT instruction: // cnt.next = cnt + (x.next & 1) - for (Instruction &Inst : llvm::make_range( - LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { + for (Instruction &Inst : + llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) { if (Inst.getOpcode() != Instruction::Add) continue; @@ -1745,8 +1745,8 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 { CountInst = nullptr; - for (Instruction &Inst : llvm::make_range( - LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { + for (Instruction &Inst : + llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) { if (Inst.getOpcode() != Instruction::Add) continue; @@ -1869,8 +1869,8 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, // plus "cnt0". Currently it is not optimized. 
// This step could be used to detect POPCNT instruction: // cnt.next = cnt + (x.next & 1) - for (Instruction &Inst : llvm::make_range( - LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { + for (Instruction &Inst : + llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) { if (Inst.getOpcode() != Instruction::Add) continue; diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index ed80040aa4236..38fc682698c53 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -1350,7 +1350,7 @@ bool LoopInterchangeTransform::transform() { // Duplicate instruction and move it the new latch. Update uses that // have been moved. Instruction *NewI = WorkList[i]->clone(); - NewI->insertBefore(NewLatch->getFirstNonPHI()); + NewI->insertBefore(NewLatch->getFirstNonPHIIt()); assert(!NewI->mayHaveSideEffects() && "Moving instructions with side-effects may change behavior of " "the loop nest!"); @@ -1388,8 +1388,9 @@ bool LoopInterchangeTransform::transform() { // Ensure the inner loop phi nodes have a separate basic block. BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); - if (InnerLoopHeader->getFirstNonPHI() != InnerLoopHeader->getTerminator()) { - SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); + if (&*InnerLoopHeader->getFirstNonPHIIt() != + InnerLoopHeader->getTerminator()) { + SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI); LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n"); } @@ -1526,12 +1527,12 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, // InnerLatch, which will become the new exit block for the innermost // loop after interchanging. for (PHINode *P : LcssaInnerExit) - P->moveBefore(InnerLatch->getFirstNonPHI()); + P->moveBefore(InnerLatch->getFirstNonPHIIt()); // If the inner loop latch contains LCSSA PHIs, those come from a child loop // and we have to move them to the new inner latch. for (PHINode *P : LcssaInnerLatch) - P->moveBefore(InnerExit->getFirstNonPHI()); + P->moveBefore(InnerExit->getFirstNonPHIIt()); // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have // incoming values defined in the outer loop, we have to add a new PHI @@ -1557,7 +1558,7 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, continue; NewPhi->addIncoming(P.getIncomingValue(0), Pred); } - NewPhi->insertBefore(InnerLatch->getFirstNonPHI()); + NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt()); P.setIncomingValue(0, NewPhi); } } @@ -1697,12 +1698,12 @@ bool LoopInterchangeTransform::adjustLoopBranches() { // outer loop and all the remains to do is and updating the incoming blocks. 
for (PHINode *PHI : OuterLoopPHIs) { LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump();); - PHI->moveBefore(InnerLoopHeader->getFirstNonPHI()); + PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt()); assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node"); } for (PHINode *PHI : InnerLoopPHIs) { LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump();); - PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); + PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt()); assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node"); } diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index ae9103d0608a1..765b76e54068c 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -365,7 +365,7 @@ class ConstantTerminatorFoldingImpl { for (auto &PN : BB->phis()) DeadInstructions.push_back(&PN); - if (auto *LandingPad = dyn_cast(BB->getFirstNonPHI())) + if (auto *LandingPad = dyn_cast(BB->getFirstNonPHIIt())) DeadInstructions.emplace_back(LandingPad); for (Instruction *I : DeadInstructions) { diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 7ec1949c1c10f..8be2f78187a0c 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -102,7 +102,7 @@ static bool runSCCP(Function &F, const DataLayout &DL, // Remove unreachable blocks and non-feasible edges. for (BasicBlock *DeadBB : BlocksToErase) - NumInstRemoved += changeToUnreachable(DeadBB->getFirstNonPHI(), + NumInstRemoved += changeToUnreachable(&*DeadBB->getFirstNonPHIIt(), /*PreserveLCSSA=*/false, &DTU); BasicBlock *NewUnreachableBB = nullptr; diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 4606514cbc717..62b4b545f29bb 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -366,8 +366,8 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F, continue; // Don't even think about ehpads/landingpads. - Instruction *FirstNonPHI = Target->getFirstNonPHI(); - if (FirstNonPHI->isEHPad() || Target->isLandingPad()) + auto FirstNonPHIIt = Target->getFirstNonPHIIt(); + if (FirstNonPHIIt->isEHPad() || Target->isLandingPad()) continue; // Remember edge probabilities if needed. @@ -380,7 +380,7 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F, BPI->eraseBlock(Target); } - BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split"); + BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHIIt, ".split"); if (ShouldUpdateAnalysis) { // Copy the BFI/BPI from Target to BodyBlock. 
BPI->setEdgeProbability(BodyBlock, EdgeProbabilities); diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 33b3e4aea12d3..526132f5e5332 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -410,8 +410,8 @@ CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) { assert(!getFirstPHI(CommonExitBlock) && "Phi not expected"); #endif - BasicBlock *NewExitBlock = CommonExitBlock->splitBasicBlock( - CommonExitBlock->getFirstNonPHI()->getIterator()); + BasicBlock *NewExitBlock = + CommonExitBlock->splitBasicBlock(CommonExitBlock->getFirstNonPHIIt()); for (BasicBlock *Pred : llvm::make_early_inc_range(predecessors(CommonExitBlock))) { @@ -701,7 +701,7 @@ void CodeExtractor::severSplitPHINodesOfEntry(BasicBlock *&Header) { // containing PHI nodes merging values from outside of the region, and a // second that contains all of the code for the block and merges back any // incoming values from inside of the region. - BasicBlock *NewBB = SplitBlock(Header, Header->getFirstNonPHI(), DT); + BasicBlock *NewBB = SplitBlock(Header, Header->getFirstNonPHIIt(), DT); // We only want to code extract the second block now, and it becomes the new // header of the region. diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index aa5e04d71657a..6a8a468ebcae2 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -276,7 +276,7 @@ static Value *getUnwindDestTokenHelper(Instruction *EHPad, Value *UnwindDestToken = nullptr; if (auto *CatchSwitch = dyn_cast(CurrentPad)) { if (CatchSwitch->hasUnwindDest()) { - UnwindDestToken = CatchSwitch->getUnwindDest()->getFirstNonPHI(); + UnwindDestToken = &*CatchSwitch->getUnwindDest()->getFirstNonPHIIt(); } else { // Catchswitch doesn't have a 'nounwind' variant, and one might be // annotated as "unwinds to caller" when really it's nounwind (see @@ -288,7 +288,8 @@ static Value *getUnwindDestTokenHelper(Instruction *EHPad, HE = CatchSwitch->handler_end(); HI != HE && !UnwindDestToken; ++HI) { BasicBlock *HandlerBlock = *HI; - auto *CatchPad = cast(HandlerBlock->getFirstNonPHI()); + auto *CatchPad = + cast(&*HandlerBlock->getFirstNonPHIIt()); for (User *Child : CatchPad->users()) { // Intentionally ignore invokes here -- since the catchswitch is // marked "unwind to caller", it would be a verifier error if it @@ -326,14 +327,14 @@ static Value *getUnwindDestTokenHelper(Instruction *EHPad, for (User *U : CleanupPad->users()) { if (auto *CleanupRet = dyn_cast(U)) { if (BasicBlock *RetUnwindDest = CleanupRet->getUnwindDest()) - UnwindDestToken = RetUnwindDest->getFirstNonPHI(); + UnwindDestToken = &*RetUnwindDest->getFirstNonPHIIt(); else UnwindDestToken = ConstantTokenNone::get(CleanupPad->getContext()); break; } Value *ChildUnwindDestToken; if (auto *Invoke = dyn_cast(U)) { - ChildUnwindDestToken = Invoke->getUnwindDest()->getFirstNonPHI(); + ChildUnwindDestToken = &*Invoke->getUnwindDest()->getFirstNonPHIIt(); } else if (isa(U) || isa(U)) { Instruction *ChildPad = cast(U); auto Memo = MemoMap.find(ChildPad); @@ -522,14 +523,13 @@ static Value *getUnwindDestToken(Instruction *EHPad, if (auto *CatchSwitch = dyn_cast(UselessPad)) { assert(CatchSwitch->getUnwindDest() == nullptr && "Expected useless pad"); for (BasicBlock *HandlerBlock : CatchSwitch->handlers()) { - auto *CatchPad = HandlerBlock->getFirstNonPHI(); + auto *CatchPad = 
&*HandlerBlock->getFirstNonPHIIt(); for (User *U : CatchPad->users()) { - assert( - (!isa(U) || - (getParentPad( - cast(U)->getUnwindDest()->getFirstNonPHI()) == - CatchPad)) && - "Expected useless pad"); + assert((!isa(U) || + (getParentPad(&*cast(U) + ->getUnwindDest() + ->getFirstNonPHIIt()) == CatchPad)) && + "Expected useless pad"); if (isa(U) || isa(U)) Worklist.push_back(cast(U)); } @@ -538,11 +538,12 @@ static Value *getUnwindDestToken(Instruction *EHPad, assert(isa(UselessPad)); for (User *U : UselessPad->users()) { assert(!isa(U) && "Expected useless pad"); - assert((!isa(U) || - (getParentPad( - cast(U)->getUnwindDest()->getFirstNonPHI()) == - UselessPad)) && - "Expected useless pad"); + assert( + (!isa(U) || + (getParentPad( + &*cast(U)->getUnwindDest()->getFirstNonPHIIt()) == + UselessPad)) && + "Expected useless pad"); if (isa(U) || isa(U)) Worklist.push_back(cast(U)); } @@ -678,7 +679,7 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, BasicBlock *UnwindDest = II->getUnwindDest(); Function *Caller = FirstNewBlock->getParent(); - assert(UnwindDest->getFirstNonPHI()->isEHPad() && "unexpected BasicBlock!"); + assert(UnwindDest->getFirstNonPHIIt()->isEHPad() && "unexpected BasicBlock!"); // If there are PHI nodes in the unwind destination block, we need to keep // track of which values came into them from the invoke before removing the @@ -723,7 +724,7 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, } } - Instruction *I = BB->getFirstNonPHI(); + BasicBlock::iterator I = BB->getFirstNonPHIIt(); if (!I->isEHPad()) continue; @@ -772,7 +773,7 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, } if (Replacement) { - Replacement->takeName(I); + Replacement->takeName(&*I); I->replaceAllUsesWith(Replacement); I->eraseFromParent(); UpdatePHINodes(&*BB); @@ -2288,7 +2289,7 @@ remapIndices(Function &Caller, BasicBlock *StartBB, // this may be the entryblock from the inlined callee, coming into a BB // that didn't have instrumentation because of MST decisions. Let's make // sure it's placed accordingly. This is a noop elsewhere. - BBID->moveBefore(&*BB->getFirstInsertionPt()); + BBID->moveBefore(BB->getFirstInsertionPt()); } for (auto &I : llvm::make_early_inc_range(*BB)) { if (auto *Inc = dyn_cast(&I)) { @@ -2581,7 +2582,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // Ok, the call site is within a cleanuppad. Let's check the callee // for catchpads. for (const BasicBlock &CalledBB : *CalledFunc) { - if (isa(CalledBB.getFirstNonPHI())) + if (isa(CalledBB.getFirstNonPHIIt())) return InlineResult::failure("catch in cleanup funclet"); } } @@ -3029,7 +3030,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // rewriting the "parent pad" links. 
if (auto *II = dyn_cast(&CB)) { BasicBlock *UnwindDest = II->getUnwindDest(); - Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI(); + BasicBlock::iterator FirstNonPHI = UnwindDest->getFirstNonPHIIt(); if (isa(FirstNonPHI)) { HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo); } else { @@ -3055,7 +3056,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, if (CleanupRet->unwindsToCaller() && EHPadForCallUnwindsLocally) changeToUnreachable(CleanupRet); - Instruction *I = BB->getFirstNonPHI(); + BasicBlock::iterator I = BB->getFirstNonPHIIt(); if (!I->isEHPad()) continue; diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 94cf1185bc2cb..d5cf62e52cca3 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -2108,7 +2108,7 @@ insertDbgVariableRecordsForPHIs(BasicBlock *BB, for (auto PHI : InsertedPHIs) { BasicBlock *Parent = PHI->getParent(); // Avoid inserting a debug-info record into an EH block. - if (Parent->getFirstNonPHI()->isEHPad()) + if (Parent->getFirstNonPHIIt()->isEHPad()) continue; for (auto VI : PHI->operand_values()) { auto V = DbgValueMap.find(VI); @@ -2174,7 +2174,7 @@ void llvm::insertDebugValuesForPHIs(BasicBlock *BB, for (auto *PHI : InsertedPHIs) { BasicBlock *Parent = PHI->getParent(); // Avoid inserting an intrinsic into an EH block. - if (Parent->getFirstNonPHI()->isEHPad()) + if (Parent->getFirstNonPHIIt()->isEHPad()) continue; for (auto *VI : PHI->operand_values()) { auto V = DbgValueMap.find(VI); @@ -3206,7 +3206,7 @@ static bool markAliveBlocks(Function &F, BasicBlock *HandlerBB = *I; if (DTU) ++NumPerSuccessorCases[HandlerBB]; - auto *CatchPad = cast(HandlerBB->getFirstNonPHI()); + auto *CatchPad = cast(HandlerBB->getFirstNonPHIIt()); if (!HandlerSet.insert({CatchPad, Empty}).second) { if (DTU) --NumPerSuccessorCases[HandlerBB]; diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index b3f9f76274d30..61ffb49a8c010 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -382,7 +382,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(), Header->getName() + ".backedge", F); BranchInst *BETerminator = BranchInst::Create(Header, BEBlock); - BETerminator->setDebugLoc(Header->getFirstNonPHI()->getDebugLoc()); + BETerminator->setDebugLoc(Header->getFirstNonPHIIt()->getDebugLoc()); LLVM_DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block " << BEBlock->getName() << "\n"); diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 91291b429ea43..dbab56a6996ce 100644 --- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -109,8 +109,9 @@ void llvm::createMemCpyLoopKnownSize( uint64_t BytesCopied = LoopEndCount; uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied; if (RemainingBytes) { - IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI() - : InsertBefore); + BasicBlock::iterator InsertIt = PostLoopBB ? 
PostLoopBB->getFirstNonPHIIt() + : InsertBefore->getIterator(); + IRBuilder<> RBuilder(InsertIt->getParent(), InsertIt); SmallVector RemainingOps; TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, @@ -735,14 +736,16 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore, // the same way, except that we change the IRBuilder insert point for each // load/store pair so that each one is inserted before the previous one // instead of after it. - IRBuilder<> BwdResBuilder(CopyBackwardsBB->getFirstNonPHI()); + IRBuilder<> BwdResBuilder(CopyBackwardsBB, + CopyBackwardsBB->getFirstNonPHIIt()); SmallVector RemainingOps; TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, SrcAS, DstAS, PartSrcAlign, PartDstAlign); for (auto *OpTy : RemainingOps) { // reverse the order of the emitted operations - BwdResBuilder.SetInsertPoint(CopyBackwardsBB->getFirstNonPHI()); + BwdResBuilder.SetInsertPoint(CopyBackwardsBB, + CopyBackwardsBB->getFirstNonPHIIt()); GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied); } } diff --git a/llvm/lib/Transforms/Utils/MoveAutoInit.cpp b/llvm/lib/Transforms/Utils/MoveAutoInit.cpp index 9b1b09bb3d8f2..ad105f5a57b49 100644 --- a/llvm/lib/Transforms/Utils/MoveAutoInit.cpp +++ b/llvm/lib/Transforms/Utils/MoveAutoInit.cpp @@ -179,7 +179,7 @@ static bool runMoveAutoInit(Function &F, DominatorTree &DT, MemorySSA &MSSA) { // CatchSwitchInst blocks can only have one instruction, so they are not // good candidates for insertion. - while (isa(UsersDominator->getFirstNonPHI())) { + while (isa(UsersDominator->getFirstNonPHIIt())) { for (BasicBlock *Pred : predecessors(UsersDominator)) if (DT.isReachableFromEntry(Pred)) UsersDominator = DT.findNearestCommonDominator(UsersDominator, Pred); diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 229b1d9f07f8c..48d9528f0c3df 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -173,8 +173,8 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // Set the DebugLoc of the inserted PHI, if available. DebugLoc DL; - if (const Instruction *I = BB->getFirstNonPHI()) - DL = I->getDebugLoc(); + if (BasicBlock::iterator It = BB->getFirstNonPHIIt(); It != BB->end()) + DL = It->getDebugLoc(); InsertedPHI->setDebugLoc(DL); // If the client wants to know about all new instructions, tell it. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e4e87704c1c97..49694eb68e25b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7959,7 +7959,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( PhisInBlock.push_back(&Phi); for (PHINode *Phi : PhisInBlock) { - Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); + Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt()); Phi->replaceIncomingBlockWith( VecEpilogueIterationCountCheck->getSinglePredecessor(), VecEpilogueIterationCountCheck); @@ -10291,8 +10291,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, // VPReductionPHIRecipes for AnyOf reductions expect a boolean as // start value; compare the final value from the main vector loop // to the start value. 
- IRBuilder<> Builder( - cast(ResumeV)->getParent()->getFirstNonPHI()); + BasicBlock *PBB = cast(ResumeV)->getParent(); + IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt()); ResumeV = Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 9d973d200662d..4159a71469bd1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -316,10 +316,9 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { // last PHI, if LastInst is a PHI. This ensures the insertelement sequence // will directly follow the scalar definitions. auto OldIP = Builder.saveIP(); - auto NewIP = - isa(LastInst) - ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) - : std::next(BasicBlock::iterator(LastInst)); + auto NewIP = isa(LastInst) + ? LastInst->getParent()->getFirstNonPHIIt() + : std::next(BasicBlock::iterator(LastInst)); Builder.SetInsertPoint(&*NewIP); // However, if we are vectorizing, we need to construct the vector values. diff --git a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp index 41e3ffd963f5b..da363df77d0c0 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp @@ -52,7 +52,7 @@ static void replaceBranchTerminator(BasicBlock &BB, bool IsBranch = isa(Term); if (InvokeInst *Invoke = dyn_cast(Term)) { BasicBlock *UnwindDest = Invoke->getUnwindDest(); - Instruction *LP = UnwindDest->getFirstNonPHI(); + BasicBlock::iterator LP = UnwindDest->getFirstNonPHIIt(); // Remove landingpad instruction if the containing block isn't used by other // invokes. 
diff --git a/llvm/unittests/Analysis/MemorySSATest.cpp b/llvm/unittests/Analysis/MemorySSATest.cpp index 1fb3f46b9240f..ad4393ccd5315 100644 --- a/llvm/unittests/Analysis/MemorySSATest.cpp +++ b/llvm/unittests/Analysis/MemorySSATest.cpp @@ -1578,7 +1578,7 @@ TEST_F(MemorySSATest, TestLoopInvariantEntryBlockPointer) { for (auto &BB : *F) { if (BB.getName() == "exit") { // Get the store instruction - auto *SI = BB.getFirstNonPHI(); + auto *SI = &*BB.getFirstNonPHIIt(); // Get the memory access and location MemoryAccess *MA = MSSA.getMemoryAccess(SI); MemoryLocation ML = MemoryLocation::get(SI); diff --git a/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp b/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp index f36d3ba99775b..519389d8e0b19 100644 --- a/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp +++ b/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp @@ -146,7 +146,7 @@ TEST_F(ProfileSummaryInfoTest, TestNoProfile) { EXPECT_FALSE(PSI.isHotBlock(&BB0, &BFI)); EXPECT_FALSE(PSI.isColdBlock(&BB0, &BFI)); - CallBase &CS1 = cast(*BB1->getFirstNonPHI()); + CallBase &CS1 = cast(*BB1->getFirstNonPHIIt()); EXPECT_FALSE(PSI.isHotCallSite(CS1, &BFI)); EXPECT_FALSE(PSI.isColdCallSite(CS1, &BFI)); } @@ -240,8 +240,8 @@ TEST_F(ProfileSummaryInfoTest, InstrProf) { EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB2, &BFI)); EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB3, &BFI)); - CallBase &CS1 = cast(*BB1->getFirstNonPHI()); - auto *CI2 = BB2->getFirstNonPHI(); + CallBase &CS1 = cast(*BB1->getFirstNonPHIIt()); + BasicBlock::iterator CI2 = BB2->getFirstNonPHIIt(); CallBase &CS2 = cast(*CI2); EXPECT_TRUE(PSI.isHotCallSite(CS1, &BFI)); @@ -336,8 +336,8 @@ TEST_F(ProfileSummaryInfoTest, SampleProf) { EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB2, &BFI)); EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB3, &BFI)); - CallBase &CS1 = cast(*BB1->getFirstNonPHI()); - auto *CI2 = BB2->getFirstNonPHI(); + CallBase &CS1 = cast(*BB1->getFirstNonPHIIt()); + BasicBlock::iterator CI2 = BB2->getFirstNonPHIIt(); // Manually attach branch weights metadata to the call instruction. 
SmallVector Weights; Weights.push_back(1000); diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index f620d2c968b3f..42616155f0cc3 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -6434,7 +6434,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { // Check entry block auto &EntryBlock = OutlinedFn->getEntryBlock(); - Instruction *Alloca1 = EntryBlock.getFirstNonPHI(); + Instruction *Alloca1 = &*EntryBlock.getFirstNonPHIIt(); EXPECT_NE(Alloca1, nullptr); EXPECT_TRUE(isa(Alloca1)); @@ -6469,7 +6469,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { // Check user code block auto *UserCodeBlock = EntryBlockBranch->getSuccessor(0); EXPECT_EQ(UserCodeBlock->getName(), "user_code.entry"); - auto *Load1 = UserCodeBlock->getFirstNonPHI(); + Instruction *Load1 = &*UserCodeBlock->getFirstNonPHIIt(); EXPECT_TRUE(isa(Load1)); auto *Load2 = Load1->getNextNode(); EXPECT_TRUE(isa(Load2)); @@ -6480,7 +6480,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { auto *OutlinedBlock = OutlinedBlockBr->getSuccessor(0); EXPECT_EQ(OutlinedBlock->getName(), "outlined.body"); - auto *Value1 = OutlinedBlock->getFirstNonPHI(); + Instruction *Value1 = &*OutlinedBlock->getFirstNonPHIIt(); EXPECT_EQ(Value1, Value); EXPECT_EQ(Value1->getNextNode(), TargetStore); auto *Deinit = TargetStore->getNextNode(); @@ -6496,7 +6496,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { // Check exit block auto *ExitBlock = EntryBlockBranch->getSuccessor(1); EXPECT_EQ(ExitBlock->getName(), "worker.exit"); - EXPECT_TRUE(isa(ExitBlock->getFirstNonPHI())); + EXPECT_TRUE(isa(ExitBlock->getFirstNonPHIIt())); // Check global exec_mode. GlobalVariable *Used = M->getGlobalVariable("llvm.compiler.used"); @@ -6804,7 +6804,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { // Check that we have moved our alloca created in the // BodyGenCB function, to the top of the function. 
- Instruction *Alloca1 = EntryBlock.getFirstNonPHI(); + Instruction *Alloca1 = &*EntryBlock.getFirstNonPHIIt(); EXPECT_NE(Alloca1, nullptr); EXPECT_TRUE(isa(Alloca1)); EXPECT_EQ(Alloca1, RaiseAlloca); @@ -6840,7 +6840,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { // Check user code block auto *UserCodeBlock = EntryBlockBranch->getSuccessor(0); EXPECT_EQ(UserCodeBlock->getName(), "user_code.entry"); - auto *Load1 = UserCodeBlock->getFirstNonPHI(); + BasicBlock::iterator Load1 = UserCodeBlock->getFirstNonPHIIt(); EXPECT_TRUE(isa(Load1)); auto *OutlinedBlockBr = Load1->getNextNode(); @@ -6849,7 +6849,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { auto *OutlinedBlock = OutlinedBlockBr->getSuccessor(0); EXPECT_EQ(OutlinedBlock->getName(), "outlined.body"); - auto *Load2 = OutlinedBlock->getFirstNonPHI(); + Instruction *Load2 = &*OutlinedBlock->getFirstNonPHIIt(); EXPECT_TRUE(isa(Load2)); EXPECT_EQ(Load2, Value); EXPECT_EQ(Load2->getNextNode(), TargetStore); @@ -6866,7 +6866,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { // Check exit block auto *ExitBlock = EntryBlockBranch->getSuccessor(1); EXPECT_EQ(ExitBlock->getName(), "worker.exit"); - EXPECT_TRUE(isa(ExitBlock->getFirstNonPHI())); + EXPECT_TRUE(isa(ExitBlock->getFirstNonPHIIt())); } TEST_F(OpenMPIRBuilderTest, CreateTask) { diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp index ea20c87d6b09b..4283ba7a8f823 100644 --- a/llvm/unittests/IR/DebugInfoTest.cpp +++ b/llvm/unittests/IR/DebugInfoTest.cpp @@ -134,7 +134,7 @@ TEST(StripTest, LoopMetadata) { // we update the terminator's metadata correctly, we should be able to // observe the change in emission kind for the CU. auto getEmissionKind = [&]() { - Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI(); + Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt(); MDNode *LoopMD = I.getMetadata(LLVMContext::MD_loop); return cast(LoopMD->getOperand(1)) ->getScope() @@ -183,7 +183,7 @@ TEST(MetadataTest, DeleteInstUsedByDbgRecord) { )"); // Find %b = add ... - Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI(); + Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt(); // Find the dbg.value using %b. SmallVector DVIs; @@ -268,7 +268,7 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) { !11 = !DILocation(line: 1, column: 1, scope: !6) )"); - Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI(); + Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt(); // Find the DbgVariableRecords using %b. 
SmallVector DVIs; @@ -319,7 +319,7 @@ TEST(MetadataTest, OrderingOfDbgVariableRecords) { !12 = !DILocalVariable(name: "bar", scope: !6, file: !1, line: 1, type: !10) )"); - Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI(); + Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt(); SmallVector DVIs; SmallVector DVRs; @@ -902,7 +902,7 @@ TEST(MetadataTest, ConvertDbgToDbgVariableRecord) { )"); // Find the first dbg.value, - Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI(); + Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt(); const DILocalVariable *Var = nullptr; const DIExpression *Expr = nullptr; const DILocation *Loc = nullptr; diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp index c1d3279688858..386a60702d0da 100644 --- a/llvm/unittests/IR/InstructionsTest.cpp +++ b/llvm/unittests/IR/InstructionsTest.cpp @@ -1716,7 +1716,7 @@ TEST(InstructionsTest, DropLocation) { cast(M->getNamedValue("no_parent_scope")); BasicBlock &BB = NoParentScopeF->front(); - auto *I1 = BB.getFirstNonPHI(); + auto *I1 = &*BB.getFirstNonPHIIt(); auto *I2 = I1->getNextNode(); auto *I3 = BB.getTerminator(); @@ -1738,7 +1738,7 @@ TEST(InstructionsTest, DropLocation) { cast(M->getNamedValue("with_parent_scope")); BasicBlock &BB = WithParentScopeF->front(); - auto *I2 = BB.getFirstNonPHI()->getNextNode(); + auto *I2 = BB.getFirstNonPHIIt()->getNextNode(); MDNode *Scope = cast(WithParentScopeF->getSubprogram()); EXPECT_EQ(I2->getDebugLoc().getLine(), 2U); diff --git a/llvm/unittests/Transforms/Scalar/LICMTest.cpp b/llvm/unittests/Transforms/Scalar/LICMTest.cpp index 5a986b067700c..98a69bbb47de1 100644 --- a/llvm/unittests/Transforms/Scalar/LICMTest.cpp +++ b/llvm/unittests/Transforms/Scalar/LICMTest.cpp @@ -63,7 +63,7 @@ TEST(LICMTest, TestSCEVInvalidationOnHoisting) { BasicBlock *LoopBB = EntryBB.getUniqueSuccessor(); // Select `load i64, i64* %ptr`. - Instruction *IBefore = LoopBB->getFirstNonPHI(); + Instruction *IBefore = &*LoopBB->getFirstNonPHIIt(); // Make sure the right instruction was selected. ASSERT_TRUE(isa(IBefore)); // Upon this query SCEV caches disposition of SCEV. @@ -73,7 +73,7 @@ TEST(LICMTest, TestSCEVInvalidationOnHoisting) { MPM.run(*M, MAM); // Select `load i64, i64* %ptr` after it was hoisted. - Instruction *IAfter = EntryBB.getFirstNonPHI(); + Instruction *IAfter = &*EntryBB.getFirstNonPHIIt(); // Make sure the right instruction was selected. ASSERT_TRUE(isa(IAfter)); diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp index 1293e4c921c9d..b2e3b5d32fbe2 100644 --- a/polly/lib/CodeGen/BlockGenerators.cpp +++ b/polly/lib/CodeGen/BlockGenerators.cpp @@ -508,7 +508,7 @@ Value *BlockGenerator::getOrCreateAlloca(const ScopArrayInfo *Array) { new AllocaInst(Ty, DL.getAllocaAddrSpace(), nullptr, DL.getPrefTypeAlign(Ty), ScalarBase->getName() + NameExt); BasicBlock *EntryBB = &Builder.GetInsertBlock()->getParent()->getEntryBlock(); - Addr->insertBefore(&*EntryBB->getFirstInsertionPt()); + Addr->insertBefore(EntryBB->getFirstInsertionPt()); return Addr; } @@ -869,7 +869,7 @@ void BlockGenerator::createScalarFinalization(Scop &S) { // Create the merge PHI that merges the optimized and unoptimized version. 
PHINode *MergePHI = PHINode::Create(EscapeInst->getType(), 2, EscapeInst->getName() + ".merge"); - MergePHI->insertBefore(&*MergeBB->getFirstInsertionPt()); + MergePHI->insertBefore(MergeBB->getFirstInsertionPt()); // Add the respective values to the merge PHI. MergePHI->addIncoming(EscapeInstReload, OptExitBB); @@ -950,7 +950,7 @@ void BlockGenerator::createExitPHINodeMerges(Scop &S) { cast(OriginalValue)->getParent() != MergeBB) && "Original value must no be one we just generated."); auto *MergePHI = PHINode::Create(PHI->getType(), 2, Name + ".ph.merge"); - MergePHI->insertBefore(&*MergeBB->getFirstInsertionPt()); + MergePHI->insertBefore(MergeBB->getFirstInsertionPt()); MergePHI->addIncoming(Reload, OptExitBB); MergePHI->addIncoming(OriginalValue, ExitBB); int Idx = PHI->getBasicBlockIndex(MergeBB); @@ -1384,7 +1384,7 @@ void RegionGenerator::copyPHIInstruction(ScopStmt &Stmt, PHINode *PHI, unsigned NumIncoming = PHI->getNumIncomingValues(); PHINode *PHICopy = Builder.CreatePHI(PHI->getType(), NumIncoming, "polly." + PHI->getName()); - PHICopy->moveBefore(PHICopy->getParent()->getFirstNonPHI()); + PHICopy->moveBefore(PHICopy->getParent()->getFirstNonPHIIt()); BBMap[PHI] = PHICopy; for (BasicBlock *IncomingBB : PHI->blocks()) diff --git a/polly/lib/CodeGen/LoopGenerators.cpp b/polly/lib/CodeGen/LoopGenerators.cpp index 5f772170d9628..f3975ccee44fa 100644 --- a/polly/lib/CodeGen/LoopGenerators.cpp +++ b/polly/lib/CodeGen/LoopGenerators.cpp @@ -185,7 +185,7 @@ Value *polly::createLoop(Value *LB, Value *UB, Value *Stride, DT.changeImmediateDominator(ExitBB, HeaderBB); // The loop body should be added here. - Builder.SetInsertPoint(HeaderBB->getFirstNonPHI()); + Builder.SetInsertPoint(HeaderBB->getFirstNonPHIIt()); return IV; } diff --git a/polly/lib/Transform/MaximalStaticExpansion.cpp b/polly/lib/Transform/MaximalStaticExpansion.cpp index e32a69d47f69c..c9227ac0bfd10 100644 --- a/polly/lib/Transform/MaximalStaticExpansion.cpp +++ b/polly/lib/Transform/MaximalStaticExpansion.cpp @@ -169,7 +169,7 @@ class MaximalStaticExpansionImpl { } else if (SAI->isExitPHIKind()) { // For now, we are not able to expand ExitPhi. emitRemark(SAI->getName() + " is a ExitPhi node.", - S.getEnteringBlock()->getFirstNonPHI()); + &*S.getEnteringBlock()->getFirstNonPHIIt()); return false; } @@ -270,7 +270,7 @@ class MaximalStaticExpansionImpl { // No need to expand SAI with no write. if (NumberWrites == 0) { emitRemark(SAI->getName() + " has 0 write access.", - S.getEnteringBlock()->getFirstNonPHI()); + &*S.getEnteringBlock()->getFirstNonPHIIt()); return false; } From 02c6002d1cd2dabe4b98368f91e7b4395e5ab11d Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 24 Jan 2025 13:42:06 +0000 Subject: [PATCH 003/432] [lldb][AArch64] Add Guarded Control Stack registers (#123720) The Guarded Control Stack extension implements a shadow stack and the Linux kernel provides access to 3 registers for it via ptrace. struct user_gcs { __u64 features_enabled; __u64 features_locked; __u64 gcspr_el0; }; This commit adds support for reading those from a live process. The first 2 are pseudo registers based on the real control register and the 3rd is a real register. This is the stack pointer for the guarded stack. I have added a "gcs_" prefix to the "features" registers so that they have a clear name when shown individually. Also this means they will tab complete from "gcs", and be next to gcspr_el0 in any sorted lists of registers. 
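For reference, here is a minimal sketch of how a tracer can fetch this regset
from a stopped thread. This is illustrative only, not LLDB code: it assumes a
kernel that implements NT_ARM_GCS (value as defined in the diff below), and
read_gcs is an invented helper name.

  #include <cstdint>
  #include <sys/ptrace.h>
  #include <sys/types.h>
  #include <sys/uio.h>

  #ifndef NT_ARM_GCS
  #define NT_ARM_GCS 0x410 // Guarded Control Stack control registers
  #endif

  struct user_gcs {
    uint64_t features_enabled;
    uint64_t features_locked;
    uint64_t gcspr_el0;
  };

  // Read all three GCS values in one PTRACE_GETREGSET call; the kernel
  // writes a struct user_gcs into the supplied iovec.
  bool read_gcs(pid_t tid, user_gcs &regs) {
    struct iovec iov = {&regs, sizeof(regs)};
    return ptrace(PTRACE_GETREGSET, tid, (void *)NT_ARM_GCS, &iov) == 0;
  }

In LLDB, the new register set then displays as: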
Guarded Control Stack Registers: gcs_features_enabled = 0x0000000000000000 gcs_features_locked = 0x0000000000000000 gcspr_el0 = 0x0000000000000000 Testing is more of the usual, where possible I'm writing a register then doing something in the program to confirm the value was actually sent to ptrace. --- .../NativeRegisterContextLinux_arm64.cpp | 86 ++++++++++ .../Linux/NativeRegisterContextLinux_arm64.h | 16 ++ .../Utility/RegisterContextPOSIX_arm64.cpp | 4 + .../Utility/RegisterContextPOSIX_arm64.h | 1 + .../Utility/RegisterInfoPOSIX_arm64.cpp | 39 ++++- .../Process/Utility/RegisterInfoPOSIX_arm64.h | 7 + .../linux/aarch64/gcs/TestAArch64LinuxGCS.py | 152 ++++++++++++++++++ lldb/test/API/linux/aarch64/gcs/main.c | 23 ++- 8 files changed, 323 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp index 6056f3001fed6..efd3385c46e92 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp @@ -64,8 +64,14 @@ #define NT_ARM_FPMR 0x40e /* Floating point mode register */ #endif +#ifndef NT_ARM_GCS +#define NT_ARM_GCS 0x410 /* Guarded Control Stack control registers */ +#endif + #define HWCAP_PACA (1 << 30) +#define HWCAP_GCS (1UL << 32) + #define HWCAP2_MTE (1 << 18) #define HWCAP2_FPMR (1UL << 48) @@ -150,6 +156,8 @@ NativeRegisterContextLinux::CreateHostNativeRegisterContextLinux( opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskMTE); if (*auxv_at_hwcap2 & HWCAP2_FPMR) opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskFPMR); + if (*auxv_at_hwcap & HWCAP_GCS) + opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskGCS); } opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskTLS); @@ -193,6 +201,7 @@ NativeRegisterContextLinux_arm64::NativeRegisterContextLinux_arm64( ::memset(&m_pac_mask, 0, sizeof(m_pac_mask)); ::memset(&m_tls_regs, 0, sizeof(m_tls_regs)); ::memset(&m_sme_pseudo_regs, 0, sizeof(m_sme_pseudo_regs)); + ::memset(&m_gcs_regs, 0, sizeof(m_gcs_regs)); std::fill(m_zt_reg.begin(), m_zt_reg.end(), 0); m_mte_ctrl_reg = 0; @@ -213,6 +222,7 @@ NativeRegisterContextLinux_arm64::NativeRegisterContextLinux_arm64( m_tls_is_valid = false; m_zt_buffer_is_valid = false; m_fpmr_is_valid = false; + m_gcs_is_valid = false; // SME adds the tpidr2 register m_tls_size = GetRegisterInfo().IsSSVEPresent() ? 
sizeof(m_tls_regs) @@ -433,6 +443,14 @@ NativeRegisterContextLinux_arm64::ReadRegister(const RegisterInfo *reg_info, offset = reg_info->byte_offset - GetRegisterInfo().GetFPMROffset(); assert(offset < GetFPMRBufferSize()); src = (uint8_t *)GetFPMRBuffer() + offset; + } else if (IsGCS(reg)) { + error = ReadGCS(); + if (error.Fail()) + return error; + + offset = reg_info->byte_offset - GetRegisterInfo().GetGCSOffset(); + assert(offset < GetGCSBufferSize()); + src = (uint8_t *)GetGCSBuffer() + offset; } else return Status::FromErrorString( "failed - register wasn't recognized to be a GPR or an FPR, " @@ -657,6 +675,17 @@ Status NativeRegisterContextLinux_arm64::WriteRegister( ::memcpy(dst, reg_value.GetBytes(), reg_info->byte_size); return WriteFPMR(); + } else if (IsGCS(reg)) { + error = ReadGCS(); + if (error.Fail()) + return error; + + offset = reg_info->byte_offset - GetRegisterInfo().GetGCSOffset(); + assert(offset < GetGCSBufferSize()); + dst = (uint8_t *)GetGCSBuffer() + offset; + ::memcpy(dst, reg_value.GetBytes(), reg_info->byte_size); + + return WriteGCS(); } return Status::FromErrorString("Failed to write register value"); @@ -672,6 +701,7 @@ enum RegisterSetType : uint32_t { SME, // ZA only, because SVCR and SVG are pseudo registers. SME2, // ZT only. FPMR, + GCS, // Guarded Control Stack registers. }; static uint8_t *AddRegisterSetType(uint8_t *dst, @@ -759,6 +789,13 @@ NativeRegisterContextLinux_arm64::CacheAllRegisters(uint32_t &cached_size) { return error; } + if (GetRegisterInfo().IsGCSPresent()) { + cached_size += sizeof(RegisterSetType) + GetGCSBufferSize(); + error = ReadGCS(); + if (error.Fail()) + return error; + } + // tpidr is always present but tpidr2 depends on SME. cached_size += sizeof(RegisterSetType) + GetTLSBufferSize(); error = ReadTLS(); @@ -867,6 +904,11 @@ Status NativeRegisterContextLinux_arm64::ReadAllRegisterValues( GetFPMRBufferSize()); } + if (GetRegisterInfo().IsGCSPresent()) { + dst = AddSavedRegisters(dst, RegisterSetType::GCS, GetGCSBuffer(), + GetGCSBufferSize()); + } + dst = AddSavedRegisters(dst, RegisterSetType::TLS, GetTLSBuffer(), GetTLSBufferSize()); @@ -1020,6 +1062,11 @@ Status NativeRegisterContextLinux_arm64::WriteAllRegisterValues( GetFPMRBuffer(), &src, GetFPMRBufferSize(), m_fpmr_is_valid, std::bind(&NativeRegisterContextLinux_arm64::WriteFPMR, this)); break; + case RegisterSetType::GCS: + error = RestoreRegisters( + GetGCSBuffer(), &src, GetGCSBufferSize(), m_gcs_is_valid, + std::bind(&NativeRegisterContextLinux_arm64::WriteGCS, this)); + break; } if (error.Fail()) @@ -1067,6 +1114,10 @@ bool NativeRegisterContextLinux_arm64::IsFPMR(unsigned reg) const { return GetRegisterInfo().IsFPMRReg(reg); } +bool NativeRegisterContextLinux_arm64::IsGCS(unsigned reg) const { + return GetRegisterInfo().IsGCSReg(reg); +} + llvm::Error NativeRegisterContextLinux_arm64::ReadHardwareDebugInfo() { if (!m_refresh_hwdebug_info) { return llvm::Error::success(); @@ -1215,6 +1266,7 @@ void NativeRegisterContextLinux_arm64::InvalidateAllRegisters() { m_tls_is_valid = false; m_zt_buffer_is_valid = false; m_fpmr_is_valid = false; + m_gcs_is_valid = false; // Update SVE and ZA registers in case there is change in configuration. 
ConfigureRegisterContext(); @@ -1400,6 +1452,40 @@ Status NativeRegisterContextLinux_arm64::WriteTLS() { return WriteRegisterSet(&ioVec, GetTLSBufferSize(), NT_ARM_TLS); } +Status NativeRegisterContextLinux_arm64::ReadGCS() { + Status error; + + if (m_gcs_is_valid) + return error; + + struct iovec ioVec; + ioVec.iov_base = GetGCSBuffer(); + ioVec.iov_len = GetGCSBufferSize(); + + error = ReadRegisterSet(&ioVec, GetGCSBufferSize(), NT_ARM_GCS); + + if (error.Success()) + m_gcs_is_valid = true; + + return error; +} + +Status NativeRegisterContextLinux_arm64::WriteGCS() { + Status error; + + error = ReadGCS(); + if (error.Fail()) + return error; + + struct iovec ioVec; + ioVec.iov_base = GetGCSBuffer(); + ioVec.iov_len = GetGCSBufferSize(); + + m_gcs_is_valid = false; + + return WriteRegisterSet(&ioVec, GetGCSBufferSize(), NT_ARM_GCS); +} + Status NativeRegisterContextLinux_arm64::ReadZAHeader() { Status error; diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.h b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.h index 16190b5492582..7ed0da8503496 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.h +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.h @@ -92,6 +92,7 @@ class NativeRegisterContextLinux_arm64 bool m_pac_mask_is_valid; bool m_tls_is_valid; size_t m_tls_size; + bool m_gcs_is_valid; struct user_pt_regs m_gpr_arm64; // 64-bit general purpose registers. @@ -136,6 +137,12 @@ class NativeRegisterContextLinux_arm64 uint64_t m_fpmr_reg; + struct gcs_regs { + uint64_t features_enabled; + uint64_t features_locked; + uint64_t gcspr_e0; + } m_gcs_regs; + bool IsGPR(unsigned reg) const; bool IsFPR(unsigned reg) const; @@ -166,6 +173,10 @@ class NativeRegisterContextLinux_arm64 Status WriteZA(); + Status ReadGCS(); + + Status WriteGCS(); + // No WriteZAHeader because writing only the header will disable ZA. // Instead use WriteZA and ensure you have the correct ZA buffer size set // beforehand if you wish to disable it. 
@@ -187,6 +198,7 @@ class NativeRegisterContextLinux_arm64 bool IsMTE(unsigned reg) const; bool IsTLS(unsigned reg) const; bool IsFPMR(unsigned reg) const; + bool IsGCS(unsigned reg) const; uint64_t GetSVERegVG() { return m_sve_header.vl / 8; } @@ -212,6 +224,8 @@ class NativeRegisterContextLinux_arm64 void *GetFPMRBuffer() { return &m_fpmr_reg; } + void *GetGCSBuffer() { return &m_gcs_regs; } + size_t GetSVEHeaderSize() { return sizeof(m_sve_header); } size_t GetPACMaskSize() { return sizeof(m_pac_mask); } @@ -234,6 +248,8 @@ class NativeRegisterContextLinux_arm64 size_t GetFPMRBufferSize() { return sizeof(m_fpmr_reg); } + size_t GetGCSBufferSize() { return sizeof(m_gcs_regs); } + llvm::Error ReadHardwareDebugInfo() override; llvm::Error WriteHardwareDebugRegs(DREGType hwbType) override; diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp index 575e9c8c81cbf..0233837f99d09 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp @@ -63,6 +63,10 @@ bool RegisterContextPOSIX_arm64::IsFPMR(unsigned reg) const { return m_register_info_up->IsFPMRReg(reg); } +bool RegisterContextPOSIX_arm64::IsGCS(unsigned reg) const { + return m_register_info_up->IsGCSReg(reg); +} + RegisterContextPOSIX_arm64::RegisterContextPOSIX_arm64( lldb_private::Thread &thread, std::unique_ptr register_info) diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h index 35ad56c98a7ae..de46c628d836d 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h @@ -59,6 +59,7 @@ class RegisterContextPOSIX_arm64 : public lldb_private::RegisterContext { bool IsSME(unsigned reg) const; bool IsMTE(unsigned reg) const; bool IsFPMR(unsigned reg) const; + bool IsGCS(unsigned reg) const; bool IsSVEZ(unsigned reg) const { return m_register_info_up->IsSVEZReg(reg); } bool IsSVEP(unsigned reg) const { return m_register_info_up->IsSVEPReg(reg); } diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp index f51a93e1b2dcb..c004c0f3c3cf5 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp @@ -97,6 +97,10 @@ static lldb_private::RegisterInfo g_register_infos_sme2[] = { static lldb_private::RegisterInfo g_register_infos_fpmr[] = { DEFINE_EXTENSION_REG(fpmr)}; +static lldb_private::RegisterInfo g_register_infos_gcs[] = { + DEFINE_EXTENSION_REG(gcs_features_enabled), + DEFINE_EXTENSION_REG(gcs_features_locked), DEFINE_EXTENSION_REG(gcspr_el0)}; + // Number of register sets provided by this context. enum { k_num_gpr_registers = gpr_w28 - gpr_x0 + 1, @@ -109,6 +113,7 @@ enum { // only for SME1 registers. 
k_num_sme_register = 3, k_num_fpmr_register = 1, + k_num_gcs_register = 3, k_num_register_sets_default = 2, k_num_register_sets = 3 }; @@ -221,6 +226,9 @@ static const lldb_private::RegisterSet g_reg_set_sme_arm64 = { static const lldb_private::RegisterSet g_reg_set_fpmr_arm64 = { "Floating Point Mode Register", "fpmr", k_num_fpmr_register, nullptr}; +static const lldb_private::RegisterSet g_reg_set_gcs_arm64 = { + "Guarded Control Stack Registers", "gcs", k_num_gcs_register, nullptr}; + RegisterInfoPOSIX_arm64::RegisterInfoPOSIX_arm64( const lldb_private::ArchSpec &target_arch, lldb_private::Flags opt_regsets) : lldb_private::RegisterInfoAndSetInterface(target_arch), @@ -273,6 +281,9 @@ RegisterInfoPOSIX_arm64::RegisterInfoPOSIX_arm64( if (m_opt_regsets.AllSet(eRegsetMaskFPMR)) AddRegSetFPMR(); + if (m_opt_regsets.AllSet(eRegsetMaskGCS)) + AddRegSetGCS(); + m_register_info_count = m_dynamic_reg_infos.size(); m_register_info_p = m_dynamic_reg_infos.data(); m_register_set_p = m_dynamic_reg_sets.data(); @@ -434,6 +445,24 @@ void RegisterInfoPOSIX_arm64::AddRegSetFPMR() { m_dynamic_reg_sets.back().registers = m_fpmr_regnum_collection.data(); } +void RegisterInfoPOSIX_arm64::AddRegSetGCS() { + uint32_t gcs_regnum = m_dynamic_reg_infos.size(); + for (uint32_t i = 0; i < k_num_gcs_register; i++) { + m_gcs_regnum_collection.push_back(gcs_regnum + i); + m_dynamic_reg_infos.push_back(g_register_infos_gcs[i]); + m_dynamic_reg_infos[gcs_regnum + i].byte_offset = + m_dynamic_reg_infos[gcs_regnum + i - 1].byte_offset + + m_dynamic_reg_infos[gcs_regnum + i - 1].byte_size; + m_dynamic_reg_infos[gcs_regnum + i].kinds[lldb::eRegisterKindLLDB] = + gcs_regnum + i; + } + + m_per_regset_regnum_range[m_register_set_count] = + std::make_pair(gcs_regnum, m_dynamic_reg_infos.size()); + m_dynamic_reg_sets.push_back(g_reg_set_gcs_arm64); + m_dynamic_reg_sets.back().registers = m_gcs_regnum_collection.data(); +} + uint32_t RegisterInfoPOSIX_arm64::ConfigureVectorLengthSVE(uint32_t sve_vq) { // sve_vq contains SVE Quad vector length in context of AArch64 SVE. // SVE register infos if enabled cannot be disabled by selecting sve_vq = 0. 
@@ -561,6 +590,10 @@ bool RegisterInfoPOSIX_arm64::IsFPMRReg(unsigned reg) const { return llvm::is_contained(m_fpmr_regnum_collection, reg); } +bool RegisterInfoPOSIX_arm64::IsGCSReg(unsigned reg) const { + return llvm::is_contained(m_gcs_regnum_collection, reg); +} + uint32_t RegisterInfoPOSIX_arm64::GetRegNumSVEZ0() const { return sve_z0; } uint32_t RegisterInfoPOSIX_arm64::GetRegNumSVEFFR() const { return sve_ffr; } @@ -593,4 +626,8 @@ uint32_t RegisterInfoPOSIX_arm64::GetSMEOffset() const { uint32_t RegisterInfoPOSIX_arm64::GetFPMROffset() const { return m_register_info_p[m_fpmr_regnum_collection[0]].byte_offset; -} \ No newline at end of file +} + +uint32_t RegisterInfoPOSIX_arm64::GetGCSOffset() const { + return m_register_info_p[m_gcs_regnum_collection[0]].byte_offset; +} diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h index 16a951ef0935f..d2ddf7d86d8c3 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h @@ -33,6 +33,7 @@ class RegisterInfoPOSIX_arm64 eRegsetMaskZA = 32, eRegsetMaskZT = 64, eRegsetMaskFPMR = 128, + eRegsetMaskGCS = 256, eRegsetMaskDynamic = ~1, }; @@ -113,6 +114,8 @@ class RegisterInfoPOSIX_arm64 void AddRegSetFPMR(); + void AddRegSetGCS(); + uint32_t ConfigureVectorLengthSVE(uint32_t sve_vq); void ConfigureVectorLengthZA(uint32_t za_vq); @@ -132,6 +135,7 @@ class RegisterInfoPOSIX_arm64 bool IsMTEPresent() const { return m_opt_regsets.AnySet(eRegsetMaskMTE); } bool IsTLSPresent() const { return m_opt_regsets.AnySet(eRegsetMaskTLS); } bool IsFPMRPresent() const { return m_opt_regsets.AnySet(eRegsetMaskFPMR); } + bool IsGCSPresent() const { return m_opt_regsets.AnySet(eRegsetMaskGCS); } bool IsSVEReg(unsigned reg) const; bool IsSVEZReg(unsigned reg) const; @@ -144,6 +148,7 @@ class RegisterInfoPOSIX_arm64 bool IsSMERegZA(unsigned reg) const; bool IsSMERegZT(unsigned reg) const; bool IsFPMRReg(unsigned reg) const; + bool IsGCSReg(unsigned reg) const; uint32_t GetRegNumSVEZ0() const; uint32_t GetRegNumSVEFFR() const; @@ -156,6 +161,7 @@ class RegisterInfoPOSIX_arm64 uint32_t GetTLSOffset() const; uint32_t GetSMEOffset() const; uint32_t GetFPMROffset() const; + uint32_t GetGCSOffset() const; private: typedef std::map> @@ -188,6 +194,7 @@ class RegisterInfoPOSIX_arm64 std::vector m_tls_regnum_collection; std::vector m_sme_regnum_collection; std::vector m_fpmr_regnum_collection; + std::vector m_gcs_regnum_collection; }; #endif diff --git a/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py b/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py index 0928ff8e14e00..d3d4dbecf4a2a 100644 --- a/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py +++ b/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py @@ -83,3 +83,155 @@ def test_gcs_fault(self): "stop reason = signal SIGSEGV: control protection fault", ], ) + + @skipUnlessArch("aarch64") + @skipUnlessPlatform(["linux"]) + def test_gcs_registers(self): + if not self.isAArch64GCS(): + self.skipTest("Target must support GCS.") + + self.build() + self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) + + self.runCmd("b test_func") + self.runCmd("b test_func2") + self.runCmd("run", RUN_SUCCEEDED) + + if self.process().GetState() == lldb.eStateExited: + self.fail("Test program failed to run.") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + 
self.expect("register read --all", substrs=["Guarded Control Stack Registers:"]) + + # This helper reads all the GCS registers and optionally compares them + # against a previous state, then returns the current register values. + def check_gcs_registers( + expected_gcs_features_enabled=None, + expected_gcs_features_locked=None, + expected_gcspr_el0=None, + ): + thread = self.dbg.GetSelectedTarget().process.GetThreadAtIndex(0) + registerSets = thread.GetFrameAtIndex(0).GetRegisters() + gcs_registers = registerSets.GetFirstValueByName( + r"Guarded Control Stack Registers" + ) + + gcs_features_enabled = gcs_registers.GetChildMemberWithName( + "gcs_features_enabled" + ).GetValueAsUnsigned() + if expected_gcs_features_enabled is not None: + self.assertEqual(expected_gcs_features_enabled, gcs_features_enabled) + + gcs_features_locked = gcs_registers.GetChildMemberWithName( + "gcs_features_locked" + ).GetValueAsUnsigned() + if expected_gcs_features_locked is not None: + self.assertEqual(expected_gcs_features_locked, gcs_features_locked) + + gcspr_el0 = gcs_registers.GetChildMemberWithName( + "gcspr_el0" + ).GetValueAsUnsigned() + if expected_gcspr_el0 is not None: + self.assertEqual(expected_gcspr_el0, gcspr_el0) + + return gcs_features_enabled, gcs_features_locked, gcspr_el0 + + enabled, locked, spr_el0 = check_gcs_registers() + + # Features enabled should have at least the enable bit set, it could have + # others depending on what the C library did, but we can't rely on always + # having them. + self.assertTrue(enabled & 1, "Expected GCS enable bit to be set.") + + # Features locked we cannot predict, we will just assert that it remains + # the same as we continue. + + # spr_el0 will point to some memory region that is a shadow stack region. + self.expect(f"memory region {spr_el0}", substrs=["shadow stack: yes"]) + + # Continue into test_func2, where the GCS pointer should have been + # decremented, and the other registers remain the same. + self.runCmd("continue") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + _, _, spr_el0 = check_gcs_registers(enabled, locked, spr_el0 - 8) + + # Any combination of GCS feature lock bits might have been set by the C + # library, and could be set to 0 or 1. To check that we can modify them, + # invert one of those bits then write it back to the lock register. + # The stack pushing feature is bit 2 of that register. + STACK_PUSH = 2 + # Get the original value of the stack push lock bit. + stack_push = bool((locked >> STACK_PUSH) & 1) + # Invert the value and put it back into the set of lock bits. + new_locked = (locked & ~(1 << STACK_PUSH)) | (int(not stack_push) << STACK_PUSH) + # Write the new lock bits, which are the same as before, only with stack + # push locked (if it was previously unlocked), or unlocked (if it was + # previously locked). + self.runCmd(f"register write gcs_features_locked 0x{new_locked:x}") + # We should be able to read back this new set of lock bits. + self.expect( + f"register read gcs_features_locked", + substrs=[f"gcs_features_locked = 0x{new_locked:016x}"], + ) + + # We could prove the write made it to hardware by trying to prctl() to + # enable or disable the stack push feature here, but because the libc + # may or may not have locked it, it's tricky to coordinate this. Given + # that we know the other registers can be written and their values are + # seen by the process, we can assume this is too. 
+ + # Restore the original lock bits, as the libc may rely on being able + # to use certain features during program execution. + self.runCmd(f"register write gcs_features_locked 0x{locked:x}") + + # Modify the guarded control stack pointer to cause a fault. + spr_el0 += 8 + self.runCmd(f"register write gcspr_el0 {spr_el0}") + self.expect( + "register read gcspr_el0", substrs=[f"gcspr_el0 = 0x{spr_el0:016x}"] + ) + + # If we wrote it back correctly, we will now fault. Don't pass this signal + # to the application, as we will continue past it later. + self.runCmd("process handle SIGSEGV --pass false") + self.runCmd("continue") + + self.expect( + "thread list", + "Expected stopped by SIGSEGV.", + substrs=[ + "stopped", + "stop reason = signal SIGSEGV: control protection fault", + ], + ) + + # Now to prove we can write gcs_features_enabled, disable GCS and continue + # past the fault we caused. Note that although the libc likely locked the + # ability to disable GCS, ptrace bypasses the lock bits. + enabled &= ~1 + self.runCmd(f"register write gcs_features_enabled {enabled}") + self.expect( + "register read gcs_features_enabled", + substrs=[f"gcs_features_enabled = 0x{enabled:016x}"], + ) + + # With GCS disabled, the invalid guarded control stack pointer is not + # checked, so the program can finish normally. + self.runCmd("continue") + self.expect( + "process status", + substrs=[ + "exited with status = 0", + ], + ) diff --git a/lldb/test/API/linux/aarch64/gcs/main.c b/lldb/test/API/linux/aarch64/gcs/main.c index 32a9b07c20743..09354639af376 100644 --- a/lldb/test/API/linux/aarch64/gcs/main.c +++ b/lldb/test/API/linux/aarch64/gcs/main.c @@ -2,8 +2,8 @@ #include #include -#ifndef HWCAP2_GCS -#define HWCAP2_GCS (1UL << 63) +#ifndef HWCAP_GCS +#define HWCAP_GCS (1UL << 32) #endif #define PR_GET_SHADOW_STACK_STATUS 74 @@ -49,8 +49,14 @@ void gcs_signal() { "ret\n"); } +// These functions are used to observe gcspr_el0 changing as we enter them, and +// the fault we cause by changing its value. +void test_func2() { volatile int i = 99; } + +void test_func() { test_func2(); } + int main() { - if (!(getauxval(AT_HWCAP2) & HWCAP2_GCS)) + if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) return 1; unsigned long mode = get_gcs_status(); @@ -63,7 +69,16 @@ int main() { } // By now we should have one memory region where the GCS is stored. - gcs_signal(); // Set break point at this line. + + // For register read/write tests. + test_func(); + + // If this was a register test, we would have disabled GCS during the + // test_func call. We cannot re-enable it from ptrace so skip this part in + // this case. + mode = get_gcs_status(); + if ((mode & 1) == 1) + gcs_signal(); // Set break point at this line. return 0; } From 11b040192640ef3b1f481124c440f464ed6ec86a Mon Sep 17 00:00:00 2001 From: Aaditya <115080342+easyonaadit@users.noreply.github.com> Date: Fri, 24 Jan 2025 19:13:40 +0530 Subject: [PATCH 004/432] [AMDGPU] Restore SP from saved-FP or saved-BP (#124007) Currently, the AMDGPU backend bumps the Stack Pointer by fixed size offsets in the prolog of device functions, and restores it by the same amount in the epilog. Prolog: sp += frameSize Epilog: sp -= frameSize If a function has dynamic stack realignment, Prolog: sp += frameSize + max_alignment Epilog: sp -= frameSize + max_alignment These calculations are not optimal in case of dynamic stack realignment, and completely fail in case of dynamic stack readjustment. This patch uses the saved Frame Pointer to restore SP. 
Prolog:
    fp = sp
    sp += frameSize
Epilog:
    sp = fp

In case of dynamic stack realignment, SP is restored from the saved Base
Pointer.

Prolog:
    fp = sp + (max_alignment - 1)
    fp = fp & (-max_alignment)
    bp = sp
    sp += frameSize + max_alignment
Epilog:
    sp = bp

(Note: The presence of BP has been enforced in case of any dynamic stack
realignment.)

---------

Co-authored-by: Pravin Jagtap
Co-authored-by: Matt Arsenault
---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 20 +-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 3 +-
 .../CodeGen/AMDGPU/GlobalISel/assert-align.ll | 2 +-
 .../GlobalISel/call-outgoing-stack-args.ll | 8 +-
 .../GlobalISel/dynamic-alloca-uniform.ll | 48 +-
 .../CodeGen/AMDGPU/GlobalISel/localizer.ll | 2 +-
 .../AMDGPU/GlobalISel/non-entry-alloca.ll | 7 +-
 .../abi-attribute-hints-undefined-behavior.ll | 2 +-
 .../amdgpu-simplify-libcall-pow-codegen.ll | 10 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll | 72 +-
 ...ffer-fat-pointers-contents-legalization.ll | 10 +-
 llvm/test/CodeGen/AMDGPU/call-args-inreg.ll | 96 +-
 .../CodeGen/AMDGPU/call-argument-types.ll | 40 +-
 .../test/CodeGen/AMDGPU/callee-frame-setup.ll | 59 +-
 .../callee-special-input-vgprs-packed.ll | 2 +-
 .../AMDGPU/callee-special-input-vgprs.ll | 2 +-
 .../AMDGPU/cross-block-use-is-not-abi-copy.ll | 8 +-
 .../AMDGPU/dwarf-multi-register-use-crash.ll | 2 +-
 .../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 184 ++--
 .../eliminate-frame-index-v-add-co-u32.mir | 240 +++--
 .../eliminate-frame-index-v-add-u32.mir | 25 +-
 .../fix-frame-reg-in-custom-csr-spills.ll | 7 +-
 ...frame-setup-without-sgpr-to-vgpr-spills.ll | 4 +-
 .../CodeGen/AMDGPU/function-args-inreg.ll | 8 +-
 .../CodeGen/AMDGPU/gfx-call-non-gfx-func.ll | 4 +-
 .../AMDGPU/gfx-callable-argument-types.ll | 959 +++++++++---------
 .../gfx-callable-preserved-registers.ll | 60 +-
 .../AMDGPU/gfx-callable-return-types.ll | 150 +--
 llvm/test/CodeGen/AMDGPU/global-alias.ll | 2 +-
 llvm/test/CodeGen/AMDGPU/indirect-call.ll | 32 +-
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 2 +-
 .../CodeGen/AMDGPU/insert-waitcnts-crash.ll | 2 +-
 .../local-stack-alloc-block-sp-reference.ll | 10 +-
 .../materialize-frame-index-sgpr.gfx10.ll | 49 +-
 .../CodeGen/AMDGPU/mul24-pass-ordering.ll | 2 +-
 .../AMDGPU/need-fp-from-vgpr-spills.ll | 8 +-
 llvm/test/CodeGen/AMDGPU/nested-calls.ll | 3 +-
 .../AMDGPU/no-source-locations-in-prologue.ll | 3 +-
 llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 14 +-
 .../AMDGPU/pei-scavenge-sgpr-carry-out.mir | 54 +-
 .../CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir | 10 +-
 .../test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir | 5 +-
 .../AMDGPU/pei-scavenge-vgpr-spill.mir | 15 +-
 .../AMDGPU/preserve-wwm-copy-dst-reg.ll | 6 +-
 .../AMDGPU/schedule-amdgpu-trackers.ll | 4 +-
 .../AMDGPU/sgpr-spill-overlap-wwm-reserve.mir | 2 +-
 .../AMDGPU/sgpr-spills-split-regalloc.ll | 6 +-
 llvm/test/CodeGen/AMDGPU/sibling-call.ll | 2 +-
 llvm/test/CodeGen/AMDGPU/stack-realign.ll | 16 +-
 .../CodeGen/AMDGPU/stacksave_stackrestore.ll | 14 +-
 .../AMDGPU/strictfp_f16_abi_promote.ll | 18 +-
 .../AMDGPU/tail-call-inreg-arguments.error.ll | 4 +-
 .../AMDGPU/unstructured-cfg-def-use-issue.ll | 4 +-
 .../CodeGen/AMDGPU/use_restore_frame_reg.mir | 10 +-
 .../CodeGen/AMDGPU/vgpr-tuple-allocation.ll | 12 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll | 4 +-
 .../AMDGPU/whole-wave-register-copy.ll | 2 +-
 .../AMDGPU/whole-wave-register-spill.ll | 4 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 8 +-
 ...dgpu_generated_funcs.ll.generated.expected | 4 +-
 ...pu_generated_funcs.ll.nogenerated.expected | 4 +-
 61 files changed, 1305 insertions(+), 1064 deletions(-)
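Before the diff itself, a small worked example of the failure mode (plain
scalar arithmetic with invented numbers, not AMDGPU code; the names mirror
the pseudo-code above):

  uint64_t sp = 0x1010;                   // SP at function entry
  uint64_t frameSize = 0x80, maxAlign = 0x100;

  // Prolog with dynamic stack realignment:
  uint64_t fp = (sp + maxAlign - 1) & ~(maxAlign - 1); // fp = 0x1100
  uint64_t bp = sp;                       // bp = 0x1010, entry SP saved
  sp += frameSize + maxAlign;             // sp = 0x1190

  // A dynamic alloca in the body grows the stack by an amount known only
  // at run time, say 0x40:
  sp += 0x40;                             // sp = 0x11d0

  // Old epilog: sp -= frameSize + maxAlign yields 0x1050, not 0x1010.
  // New epilog: restore the saved entry value directly.
  sp = bp;                                // sp = 0x1010, always correct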
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 2e2523312840a..060db477a59f8 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1256,6 +1256,18 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, Register FramePtrReg = FuncInfo->getFrameOffsetReg(); bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg); + if (RoundedSize != 0) { + if (TRI.hasBasePointer(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg) + .addReg(TRI.getBaseRegister()) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (hasFP(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg) + .addReg(FramePtrReg) + .setMIFlag(MachineInstr::FrameDestroy); + } + } + Register FramePtrRegScratchCopy; Register SGPRForFPSaveRestoreCopy = FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); @@ -1280,14 +1292,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, FramePtrRegScratchCopy); } - if (RoundedSize != 0 && hasFP(MF)) { - auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) - .addReg(StackPtrReg) - .addImm(-static_cast(RoundedSize * getScratchScaleFactor(ST))) - .setMIFlag(MachineInstr::FrameDestroy); - Add->getOperand(3).setIsDead(); // Mark SCC as dead. - } - if (FPSaved) { // Insert the copy to restore FP. Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 11121e6058770..6fc57dec6a826 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -525,8 +525,7 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { // When we need stack realignment, we can't reference off of the // stack pointer, so we reserve a base pointer. 
- const MachineFrameInfo &MFI = MF.getFrameInfo(); - return MFI.getNumFixedObjects() && shouldRealignStack(MF); + return shouldRealignStack(MF); } Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll index 604caf572b0fe..c477732e5cd59 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -27,11 +27,11 @@ define ptr addrspace(1) @call_assert_align() { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 974ce492daea8..410d3b1bb7062 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -247,11 +247,11 @@ define void @func_caller_stack() { ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] -; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; MUBUF-NEXT: s_mov_b32 s33, s4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -286,11 +286,11 @@ define void @func_caller_stack() { ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] -; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; FLATSCR-NEXT: s_mov_b32 s33, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -372,11 +372,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] -; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; MUBUF-NEXT: s_mov_b32 s33, s4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -437,11 +437,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] -; FLATSCR-NEXT: s_add_i32 s32, 
s32, -16 ; FLATSCR-NEXT: s_mov_b32 s33, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll index ae055ea041297..6b767d9e754be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -80,13 +80,13 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s32, s6, s4 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -103,7 +103,6 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -112,7 +111,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 ; GFX10-NEXT: s_add_u32 s32, s6, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s32, s33 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align4: @@ -127,7 +127,6 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s2, s32 -; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -136,9 +135,10 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, -16 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_u32 s32, s2, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s32, s33 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv, align 4 %alloca = alloca i32, i32 %n, addrspace(5) @@ -221,13 +221,13 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s32, s6, s4 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -244,7 +244,6 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -253,7 +252,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 ; GFX10-NEXT: s_add_u32 s32, s6, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s32, s33 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align16: @@ -268,7 +268,6 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s2, s32 -; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -277,9 +276,10 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, -16 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_u32 s32, s2, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s32, s33 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv, align 16 %alloca = alloca i32, i32 %n, addrspace(5) @@ -355,6 +355,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GFX9-NEXT: s_mov_b32 s7, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x1000 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 @@ -373,7 +375,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s32, s5, s4 -; GFX9-NEXT: s_addk_i32 s32, 0xf000 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: s_mov_b32 s34, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -382,8 +385,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0x3e0 -; GFX10-NEXT: s_addk_i32 s32, 0x800 +; GFX10-NEXT: s_mov_b32 s7, s34 ; GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GFX10-NEXT: s_mov_b32 s34, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x800 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 @@ -401,7 +406,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 ; GFX10-NEXT: s_add_u32 s32, s5, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xf800 +; GFX10-NEXT: s_mov_b32 s32, s34 +; GFX10-NEXT: s_mov_b32 s34, s7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align32: @@ -409,8 +415,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 31 -; GFX11-NEXT: s_add_i32 s32, s32, 64 +; GFX11-NEXT: s_mov_b32 s3, s34 ; GFX11-NEXT: s_and_not1_b32 s33, s33, 31 +; GFX11-NEXT: s_mov_b32 s34, s32 +; 
GFX11-NEXT: s_add_i32 s32, s32, 64 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12 @@ -429,8 +437,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-NEXT: s_add_u32 s32, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_addk_i32 s32, 0xffc0 +; GFX11-NEXT: s_mov_b32 s32, s34 +; GFX11-NEXT: s_mov_b32 s34, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv %alloca = alloca i32, i32 %n, align 32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 009beeb395100..767232a01c7e5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -248,11 +248,11 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) { ; GFX9-NEXT: s_swappc_b64 s[30:31], 0 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index 69abef02d3d92..34cf6905fe75b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -180,7 +180,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_mov_b32 s33, s7 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -216,8 +216,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0xfc0 +; GCN-NEXT: s_mov_b32 s8, s34 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_addk_i32 s32, 0x2000 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB3_2 @@ -242,7 +244,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_addk_i32 s32, 0xe000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s8 ; GCN-NEXT: s_mov_b32 s33, s7 ; GCN-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index e53653408feb4..194a23fa0d4a9 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -32,11 +32,11 @@ define void @parent_func_missing_inputs() #0 { ; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1 ; FIXEDABI-NEXT: v_readlane_b32 
s30, v40, 0 +; FIXEDABI-NEXT: s_mov_b32 s32, s33 ; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2 ; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; FIXEDABI-NEXT: s_mov_b64 exec, s[6:7] -; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00 ; FIXEDABI-NEXT: s_mov_b32 s33, s4 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 25b6b7be1f3b5..ab2363860af9d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -193,11 +193,11 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: v_readlane_b32 s34, v43, 2 ; CHECK-NEXT: v_readlane_b32 s31, v43, 1 ; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -329,11 +329,11 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: v_readlane_b32 s34, v43, 2 ; CHECK-NEXT: v_readlane_b32 s31, v43, 1 ; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -477,11 +477,11 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: v_readlane_b32 s34, v43, 2 ; CHECK-NEXT: v_readlane_b32 s31, v43, 1 ; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -614,11 +614,11 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: v_readlane_b32 s34, v42, 2 ; CHECK-NEXT: v_readlane_b32 s31, v42, 1 ; CHECK-NEXT: v_readlane_b32 s30, v42, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v42, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -761,11 +761,11 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: v_readlane_b32 s34, v43, 2 ; CHECK-NEXT: v_readlane_b32 s31, v43, 1 ; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: 
s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index ec469b3020cce..0382cc72a36ae 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3798,10 +3798,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v2, 1 ; GCN-NEXT: v_readlane_b32 s30, v2, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -3829,10 +3829,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v2, 1 ; GFX7-NEXT: v_readlane_b32 s30, v2, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3858,10 +3858,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3887,10 +3887,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3917,11 +3917,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3947,10 +3947,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3990,10 +3990,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr 
addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v4, 1 ; GCN-NEXT: v_readlane_b32 s30, v4, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -4026,10 +4026,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v4, 1 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4055,10 +4055,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4084,10 +4084,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4114,11 +4114,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4144,10 +4144,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4189,10 +4189,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v5, 1 ; GCN-NEXT: v_readlane_b32 s30, v5, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: 
s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -4227,10 +4227,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v4, 1 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4259,10 +4259,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v4, 1 ; GFX8-NEXT: v_readlane_b32 s30, v4, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4290,10 +4290,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4322,11 +4322,11 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4354,10 +4354,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1 ; GFX11-NEXT: v_readlane_b32 s30, v3, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4407,10 +4407,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v8, 1 ; GCN-NEXT: v_readlane_b32 s30, v8, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -4453,10 +4453,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v6, 1 ; GFX7-NEXT: v_readlane_b32 
s30, v6, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4485,10 +4485,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v4, 1 ; GFX8-NEXT: v_readlane_b32 s30, v4, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4516,10 +4516,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4548,11 +4548,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4578,10 +4578,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1 ; GFX11-NEXT: v_readlane_b32 s30, v3, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4651,10 +4651,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v16, 1 ; GCN-NEXT: v_readlane_b32 s30, v16, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -4717,10 +4717,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v10, 1 ; GFX7-NEXT: v_readlane_b32 s30, v10, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
s_setpc_b64 s[30:31] @@ -4755,10 +4755,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v6, 1 ; GFX8-NEXT: v_readlane_b32 s30, v6, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4790,10 +4790,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v5, 1 ; GFX9-NEXT: v_readlane_b32 s30, v5, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4826,11 +4826,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v5, 1 ; GFX10-NEXT: v_readlane_b32 s30, v5, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4856,10 +4856,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v5, 1 ; GFX11-NEXT: v_readlane_b32 s30, v5, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4969,10 +4969,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v20, 1 ; GCN-NEXT: v_readlane_b32 s30, v20, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -5075,10 +5075,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v18, 1 ; GFX7-NEXT: v_readlane_b32 s30, v18, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5125,10 +5125,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v10, 1 ; GFX8-NEXT: v_readlane_b32 s30, v10, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; 
GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5168,10 +5168,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v9, 1 ; GFX9-NEXT: v_readlane_b32 s30, v9, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5212,11 +5212,11 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v9, 1 ; GFX10-NEXT: v_readlane_b32 s30, v9, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5244,10 +5244,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v9, 1 ; GFX11-NEXT: v_readlane_b32 s30, v9, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index cdfaed0a203e9..9912ce3604a49 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -3099,8 +3099,11 @@ define i160 @load_i160(ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_mov_b32 s4, s33 ; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0 ; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800 +; SDAG-NEXT: s_mov_b32 s5, s34 +; SDAG-NEXT: s_mov_b32 s34, s32 ; SDAG-NEXT: s_addk_i32 s32, 0x1800 -; SDAG-NEXT: s_addk_i32 s32, 0xe800 +; SDAG-NEXT: s_mov_b32 s32, s34 +; SDAG-NEXT: s_mov_b32 s34, s5 ; SDAG-NEXT: s_mov_b32 s33, s4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3124,10 +3127,13 @@ define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_mov_b32 s4, s33 ; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0 ; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800 +; SDAG-NEXT: s_mov_b32 s5, s34 +; SDAG-NEXT: s_mov_b32 s34, s32 ; SDAG-NEXT: s_addk_i32 s32, 0x1000 ; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; SDAG-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 -; SDAG-NEXT: s_addk_i32 s32, 0xf000 +; SDAG-NEXT: s_mov_b32 s32, s34 +; SDAG-NEXT: s_mov_b32 s34, s5 ; SDAG-NEXT: s_mov_b32 s33, s4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll index 93a4469c7718e..704b68aa296a9 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll 
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll @@ -51,11 +51,11 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -79,11 +79,11 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -111,11 +111,11 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -139,11 +139,11 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -171,11 +171,11 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -199,11 +199,11 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -232,11 +232,11 @@ define void 
@test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -260,11 +260,11 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -293,11 +293,11 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -321,11 +321,11 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -355,11 +355,11 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -383,11 +383,11 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -418,11 +418,11 @@ define void @test_call_external_void_func_v4i32_inreg(<4 
x i32> inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -446,11 +446,11 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -485,11 +485,11 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -513,11 +513,11 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -545,11 +545,11 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -573,11 +573,11 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -605,11 +605,11 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { ; GFX9-NEXT: 
s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -633,11 +633,11 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -665,11 +665,11 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -693,11 +693,11 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -726,11 +726,11 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -754,11 +754,11 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -786,11 +786,11 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: 
v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -814,11 +814,11 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -847,11 +847,11 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -875,11 +875,11 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -908,11 +908,11 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -936,11 +936,11 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -969,11 +969,11 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; 
GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -997,11 +997,11 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1030,11 +1030,11 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1058,11 +1058,11 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1091,11 +1091,11 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1119,11 +1119,11 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1151,11 +1151,11 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1179,11 +1179,11 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1214,11 +1214,11 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1242,11 +1242,11 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1275,11 +1275,11 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1303,11 +1303,11 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1339,11 +1339,11 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1367,11 +1367,11 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1411,11 +1411,11 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
 ; GFX9-NEXT: s_swappc_b64 s[30:31], vcc
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1439,11 +1439,11 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1486,11 +1486,11 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1514,11 +1514,11 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 26ab0f3ce6355..35d00390067d8 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -5873,11 +5873,11 @@ define void @stack_12xv3i32() #0 {
 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT: v_readlane_b32 s31, v40, 1
 ; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: s_mov_b32 s32, s33
 ; VI-NEXT: v_readlane_b32 s4, v40, 2
 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; VI-NEXT: s_mov_b64 exec, s[6:7]
-; VI-NEXT: s_addk_i32 s32, 0xfc00
 ; VI-NEXT: s_mov_b32 s33, s4
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -5941,11 +5941,11 @@ define void @stack_12xv3i32() #0 {
 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT: v_readlane_b32 s31, v40, 1
 ; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: s_mov_b32 s32, s33
 ; CI-NEXT: v_readlane_b32 s4, v40, 2
 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CI-NEXT: s_mov_b64 exec, s[6:7]
-; CI-NEXT: s_addk_i32 s32, 0xfc00
 ; CI-NEXT: s_mov_b32 s33, s4
 ; CI-NEXT: s_waitcnt vmcnt(0)
 ; CI-NEXT: s_setpc_b64 s[30:31]
@@ -6009,11 +6009,11 @@ define void @stack_12xv3i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6059,11 +6059,11 @@ define void @stack_12xv3i32() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6127,11 +6127,11 @@ define void @stack_12xv3i32() #0 {
 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT: v_readlane_b32 s31, v40, 1
 ; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: s_mov_b32 s32, s33
 ; HSA-NEXT: v_readlane_b32 s4, v40, 2
 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; HSA-NEXT: s_mov_b64 exec, s[6:7]
-; HSA-NEXT: s_addk_i32 s32, 0xfc00
 ; HSA-NEXT: s_mov_b32 s33, s4
 ; HSA-NEXT: s_waitcnt vmcnt(0)
 ; HSA-NEXT: s_setpc_b64 s[30:31]
@@ -6212,11 +6212,11 @@ define void @stack_12xv3f32() #0 {
 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT: v_readlane_b32 s31, v40, 1
 ; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: s_mov_b32 s32, s33
 ; VI-NEXT: v_readlane_b32 s4, v40, 2
 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; VI-NEXT: s_mov_b64 exec, s[6:7]
-; VI-NEXT: s_addk_i32 s32, 0xfc00
 ; VI-NEXT: s_mov_b32 s33, s4
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -6280,11 +6280,11 @@ define void @stack_12xv3f32() #0 {
 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT: v_readlane_b32 s31, v40, 1
 ; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: s_mov_b32 s32, s33
 ; CI-NEXT: v_readlane_b32 s4, v40, 2
 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CI-NEXT: s_mov_b64 exec, s[6:7]
-; CI-NEXT: s_addk_i32 s32, 0xfc00
 ; CI-NEXT: s_mov_b32 s33, s4
 ; CI-NEXT: s_waitcnt vmcnt(0)
 ; CI-NEXT: s_setpc_b64 s[30:31]
@@ -6348,11 +6348,11 @@ define void @stack_12xv3f32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6402,11 +6402,11 @@ define void @stack_12xv3f32() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6470,11 +6470,11 @@ define void @stack_12xv3f32() #0 {
 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT: v_readlane_b32 s31, v40, 1
 ; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: s_mov_b32 s32, s33
 ; HSA-NEXT: v_readlane_b32 s4, v40, 2
 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; HSA-NEXT: s_mov_b64 exec, s[6:7]
-; HSA-NEXT: s_addk_i32 s32, 0xfc00
 ; HSA-NEXT: s_mov_b32 s33, s4
 ; HSA-NEXT: s_waitcnt vmcnt(0)
 ; HSA-NEXT: s_setpc_b64 s[30:31]
@@ -6563,11 +6563,11 @@ define void @stack_8xv5i32() #0 {
 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT: v_readlane_b32 s31, v40, 1
 ; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: s_mov_b32 s32, s33
 ; VI-NEXT: v_readlane_b32 s4, v40, 2
 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; VI-NEXT: s_mov_b64 exec, s[6:7]
-; VI-NEXT: s_addk_i32 s32, 0xfc00
 ; VI-NEXT: s_mov_b32 s33, s4
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -6639,11 +6639,11 @@ define void @stack_8xv5i32() #0 {
 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT: v_readlane_b32 s31, v40, 1
 ; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: s_mov_b32 s32, s33
 ; CI-NEXT: v_readlane_b32 s4, v40, 2
 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CI-NEXT: s_mov_b64 exec, s[6:7]
-; CI-NEXT: s_addk_i32 s32, 0xfc00
 ; CI-NEXT: s_mov_b32 s33, s4
 ; CI-NEXT: s_waitcnt vmcnt(0)
 ; CI-NEXT: s_setpc_b64 s[30:31]
@@ -6715,11 +6715,11 @@ define void @stack_8xv5i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6770,11 +6770,11 @@ define void @stack_8xv5i32() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6846,11 +6846,11 @@ define void @stack_8xv5i32() #0 {
 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT: v_readlane_b32 s31, v40, 1
 ; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: s_mov_b32 s32, s33
 ; HSA-NEXT: v_readlane_b32 s4, v40, 2
 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; HSA-NEXT: s_mov_b64 exec, s[6:7]
-; HSA-NEXT: s_addk_i32 s32, 0xfc00
 ; HSA-NEXT: s_mov_b32 s33, s4
 ; HSA-NEXT: s_waitcnt vmcnt(0)
 ; HSA-NEXT: s_setpc_b64 s[30:31]
@@ -6935,11 +6935,11 @@ define void @stack_8xv5f32() #0 {
 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT: v_readlane_b32 s31, v40, 1
 ; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: s_mov_b32 s32, s33
 ; VI-NEXT: v_readlane_b32 s4, v40, 2
 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; VI-NEXT: s_mov_b64 exec, s[6:7]
-; VI-NEXT: s_addk_i32 s32, 0xfc00
 ; VI-NEXT: s_mov_b32 s33, s4
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -7011,11 +7011,11 @@ define void @stack_8xv5f32() #0 {
 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT: v_readlane_b32 s31, v40, 1
 ; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: s_mov_b32 s32, s33
 ; CI-NEXT: v_readlane_b32 s4, v40, 2
 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CI-NEXT: s_mov_b64 exec, s[6:7]
-; CI-NEXT: s_addk_i32 s32, 0xfc00
 ; CI-NEXT: s_mov_b32 s33, s4
 ; CI-NEXT: s_waitcnt vmcnt(0)
 ; CI-NEXT: s_setpc_b64 s[30:31]
@@ -7087,11 +7087,11 @@ define void @stack_8xv5f32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7145,11 +7145,11 @@ define void @stack_8xv5f32() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7221,11 +7221,11 @@ define void @stack_8xv5f32() #0 {
 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT: v_readlane_b32 s31, v40, 1
 ; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: s_mov_b32 s32, s33
 ; HSA-NEXT: v_readlane_b32 s4, v40, 2
 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; HSA-NEXT: s_mov_b64 exec, s[6:7]
-; HSA-NEXT: s_addk_i32 s32, 0xfc00
 ; HSA-NEXT: s_mov_b32 s33, s4
 ; HSA-NEXT: s_waitcnt vmcnt(0)
 ; HSA-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index e926a3c728cbd..6fb071dd42d2f 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -58,8 +58,8 @@ define void @callee_with_stack() #0 {
 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33{{$}}
 ; FLATSCR-NEXT: scratch_store_dword off, v0, s33{{$}}
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: s_addk_i32 s32, 0xfe00
-; FLATSCR-NEXT: s_add_i32 s32, s32, -8
+; MUBUF-NEXT: s_mov_b32 s32, s33
+; FLATSCR-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_setpc_b64
 define void @callee_with_stack_no_fp_elim_all() #1 {
@@ -106,13 +106,13 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
 ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]]
 ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]]
+; MUBUF: s_mov_b32 s32, s33{{$}}
+; FLATSCR: s_mov_b32 s32, s33{{$}}
 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], 2
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF: s_addk_i32 s32, 0xfc00{{$}}
-; FLATSCR: s_add_i32 s32, s32, -16{{$}}
 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -149,13 +149,13 @@ define void @callee_with_stack_and_call() #0 {
 ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]], 0
 ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]], 1
+; MUBUF: s_mov_b32 s32, s33
+; FLATSCR: s_mov_b32 s32, s33
 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], [[FP_SPILL_LANE]]
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF: s_addk_i32 s32, 0xfc00
-; FLATSCR: s_add_i32 s32, s32, -16
 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -253,10 +253,10 @@ define void @spill_only_csr_sgpr() {
 ; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
 ; MUBUF: s_addk_i32 s32, 0x300
-; MUBUF-NEXT: s_addk_i32 s32, 0xfd00
+; MUBUF-NEXT: s_mov_b32 s32, s33
 ; MUBUF-NEXT: s_mov_b32 s33, s4
 ; FLATSCR: s_add_i32 s32, s32, 12
-; FLATSCR-NEXT: s_add_i32 s32, s32, -12
+; FLATSCR-NEXT: s_mov_b32 s32, s33
 ; FLATSCR-NEXT: s_mov_b32 s33, s0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
@@ -282,16 +282,14 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4
 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:4
 ; GCN: ;;#ASMSTART
-; GCN: v_writelane_b32 v1
-; MUBUF: s_addk_i32 s32, 0x400
-; FLATSCR: s_add_i32 s32, s32, 16
+; MUBUF: s_mov_b32 s32, s33
+; FLATSCR: s_mov_b32 s32, s33
+
 ; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF: s_addk_i32 s32, 0xfc00
-; FLATSCR: s_add_i32 s32, s32, -16
 ; GCN-NEXT: s_mov_b32 s33, [[TMP_SGPR]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
@@ -333,12 +331,12 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
 ; MUBUF: s_addk_i32 s32, 0x400
 ; FLATSCR: s_add_i32 s32, s32, 16
 ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
+; MUBUF-NEXT: s_mov_b32 s32, s33
+; FLATSCR-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
-; FLATSCR-NEXT: s_add_i32 s32, s32, -16
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
@@ -366,16 +364,22 @@ define void @no_new_vgpr_for_fp_csr() #1 {
 ; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff
 ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000
 ; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000
+; MUBUF-NEXT: s_mov_b32 s5, s34
+; FLATSCR-NEXT: s_mov_b32 s1, s34
+; MUBUF-NEXT: s_mov_b32 s34, s32
+; FLATSCR-NEXT: s_mov_b32 s34, s32
 ; MUBUF-NEXT: s_add_i32 s32, s32, 0x180000
 ; FLATSCR-NEXT: s_addk_i32 s32, 0x6000
 ; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; MUBUF-NEXT: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x2000{{$}}
 ; MUBUF-NEXT: buffer_store_dword [[ZERO]], [[OFFSET]], s[0:3], s33 offen{{$}}
-; FLATSCR-NEXT: s_add_i32 s1, s33, 0x2000
-; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s1
+; FLATSCR-NEXT: s_add_i32 s2, s33, 0x2000
+; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s2
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: s_add_i32 s32, s32, 0xffe80000
-; FLATSCR-NEXT: s_addk_i32 s32, 0xa000
+; MUBUF-NEXT: s_mov_b32 s32, s34
+; MUBUF-NEXT: s_mov_b32 s34, s5
+; FLATSCR-NEXT: s_mov_b32 s32, s34
+; FLATSCR-NEXT: s_mov_b32 s34, s1
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_setpc_b64
 define void @realign_stack_no_fp_elim() #1 {
@@ -399,16 +403,13 @@ define void @realign_stack_no_fp_elim() #1 {
 ; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}}
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN: ;;#ASMSTART
-; MUBUF: s_addk_i32 s32, 0x300
-; FLATSCR: s_add_i32 s32, s32, 12
 ; GCN: v_readlane_b32 s31, [[CSR_VGPR]], 1
 ; GCN: v_readlane_b32 s30, [[CSR_VGPR]], 0
+;GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF: s_addk_i32 s32, 0xfd00
-; FLATSCR: s_add_i32 s32, s32, -12
 ; GCN-NEXT: s_mov_b32 s33, vcc_lo
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -435,19 +436,19 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
+; GCN: v_mov_b32_e32
-; MUBUF-DAG: buffer_store_dword
-; FLATSCR-DAG: scratch_store_dword
 ; MUBUF: s_addk_i32 s32, 0x300{{$}}
 ; FLATSCR: s_add_i32 s32, s32, 12{{$}}
+; MUBUF-DAG: buffer_store_dword
+; FLATSCR-DAG: scratch_store_dword
 ; GCN: ;;#ASMSTART
+; GCN: s_mov_b32 s32, s33
 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF: s_addk_i32 s32, 0xfd00{{$}}
-; FLATSCR: s_add_i32 s32, s32, -12{{$}}
 ; GCN-NEXT: s_mov_b32 s33, vcc_lo
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -495,8 +496,6 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF: s_add_i32 s32, s32, 0xfffbfd00{{$}}
-; FLATSCR: s_addk_i32 s32, 0xeff4{{$}}
 ; GCN-NEXT: s_mov_b32 s33, vcc_lo
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -538,8 +537,6 @@ define internal void @local_empty_func() #0 {
 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}}
 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33{{$}}
 ; GCN: s_swappc_b64
-; MUBUF: s_addk_i32 s32, 0xfc00
-; FLATSCR: s_add_i32 s32, s32, -16
 ; GCN: s_mov_b32 s33, [[TMP_SGPR]]
 define void @ipra_call_with_stack() #0 {
 %alloca = alloca i32, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
index b52e7918b27ab..f85cea1ba03fb 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -428,8 +428,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GCN: s_swappc_b64
+; GCN: s_mov_b32 s32, s33
 ; GCN: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN: s_addk_i32 s32, 0xfc00{{$}}
 ; GCN: s_setpc_b64
 define void @too_many_args_call_too_many_args_use_workitem_id_x(
 i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 9792c9dabac2f..2399112e3fefb 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -441,8 +441,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GCN: s_swappc_b64
+; GCN: s_mov_b32 s32, s33
 ; GCN: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN: s_addk_i32 s32, 0xfc00{{$}}
 ; GCN: s_setpc_b64
 define void @too_many_args_call_too_many_args_use_workitem_id_x(
 i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 5e6152661aeec..9bef0b7d76ad5 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -42,11 +42,11 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: v_readlane_b32 s4, v40, 2
 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
 ; GCN-NEXT: s_mov_b32 s33, s4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -78,11 +78,11 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: v_readlane_b32 s4, v40, 2
 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
 ; GCN-NEXT: s_mov_b32 s33, s4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -114,11 +114,11 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: v_readlane_b32 s4, v40, 2
 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
 ; GCN-NEXT: s_mov_b32 s33, s4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -151,11 +151,11 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
 ; GCN-NEXT: v_mov_b32_e32 v1, v4
+; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: v_readlane_b32 s4, v40, 2
 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
 ; GCN-NEXT: s_mov_b32 s33, s4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index 2cd3916165fe7..8b02bdbb70b7b 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -91,11 +91,11 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT: v_readlane_b32 s34, v41, 2
 ; CHECK-NEXT: v_readlane_b32 s31, v41, 1
 ; CHECK-NEXT: v_readlane_b32 s30, v41, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
 ; CHECK-NEXT: v_readlane_b32 s4, v41, 16
 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT: s_mov_b64 exec, s[6:7]
-; CHECK-NEXT: s_addk_i32 s32, 0xfc00
 ; CHECK-NEXT: s_mov_b32 s33, s4
 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 9acb3a42ae102..d61c4b46596c0 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -1084,7 +1084,7 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1113,7 +1113,7 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s33
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1140,12 +1140,11 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform:
@@ -1171,11 +1170,12 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %alloca = alloca i32, i32 %n, addrspace(5)
 store volatile i32 123, ptr addrspace(5) %alloca
@@ -1189,10 +1189,12 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT: s_mov_b32 s9, s33
 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s34
 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
 ; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000
 ; GFX9-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
@@ -1210,7 +1212,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10
 ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s10
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1220,10 +1223,12 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT: s_mov_b32 s10, s34
 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
 ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000
 ; GFX9-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
@@ -1241,7 +1246,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xc000
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s10
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1251,17 +1257,18 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s34
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
 ; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100
 ; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1
@@ -1274,8 +1281,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s5
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
@@ -1284,17 +1291,18 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33
 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s34
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100
 ; GFX11-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB9_1
@@ -1307,7 +1315,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s5
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %alloca = alloca i32, i32 %n, align 128, addrspace(5)
 store volatile i32 10, ptr addrspace(5) %alloca
@@ -1340,7 +1349,7 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22
 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1369,7 +1378,7 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s33
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1396,12 +1405,11 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 22
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
@@ -1427,11 +1435,12 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 22
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %alloca = alloca i32, i32 %n, align 2, addrspace(5)
 store volatile i32 22, ptr addrspace(5) %alloca
@@ -1465,7 +1474,7 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1495,7 +1504,7 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s33
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1524,12 +1533,11 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent:
@@ -1557,11 +1565,12 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %idx = call i32 @llvm.amdgcn.workitem.id.x()
 %alloca = alloca i32, i32 %idx, addrspace(5)
@@ -1575,6 +1584,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s33
 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000
 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31
 ; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
@@ -1598,7 +1609,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
 ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s11
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s10
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1609,10 +1621,12 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT: s_mov_b32 s10, s34
 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
 ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000
 ; GFX9-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
@@ -1630,7 +1644,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xc000
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s10
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1640,14 +1655,16 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31
 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s33
 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s34
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s32
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100
 ; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000
 ; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80
-; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
 ; GFX11-SDAG-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2
 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -1665,7 +1682,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s6
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
@@ -1674,12 +1692,13 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31
 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33
 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s34
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
 ; GFX11-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
@@ -1699,7 +1718,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s5
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %idx = call i32 @llvm.amdgcn.workitem.id.x()
 %alloca = alloca i32, i32 %idx, align 128, addrspace(5)
@@ -1734,7 +1754,7 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1764,7 +1784,7 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s33
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1793,12 +1813,11 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
@@ -1826,11 +1845,12 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %idx = call i32 @llvm.amdgcn.workitem.id.x()
 %alloca = alloca i32, i32 %idx, align 2, addrspace(5)
@@ -1844,9 +1864,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT: s_mov_b32 s13, s33
 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT: s_mov_b32 s14, s34
 ; GFX9-SDAG-NEXT: s_mov_b32 s8, 0
 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x3000
 ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_6
@@ -1915,7 +1937,8 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xd000
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s14
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s13
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1924,9 +1947,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT: s_mov_b32 s13, s33
 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT: s_mov_b32 s14, s34
 ; GFX9-GISEL-NEXT: s_mov_b32 s8, 0
 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x3000
 ; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-GISEL-NEXT: s_cbranch_execz .LBB14_6
@@ -1994,7 +2019,8 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xd000
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s14
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s13
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2003,9 +2029,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT: s_mov_b32 s7, s33
 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s34
 ; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
 ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xc0
 ; GFX11-SDAG-NEXT: v_cmpx_eq_u32_e32 0, v0
 ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6
@@ -2079,9 +2107,9 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s8
 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s7
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff40
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
@@ -2089,9 +2117,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT: s_mov_b32 s7, s33
 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT: s_mov_b32 s8, s34
 ; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
 ; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xc0
 ; GFX11-GISEL-NEXT: v_cmpx_eq_u32_e32 0, v0
 ; GFX11-GISEL-NEXT: s_cbranch_execz .LBB14_6
@@ -2162,8 +2192,9 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s8
 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s7
-; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff40
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 entry:
 %cond = icmp eq i32 %n, 0
@@ -2189,9 +2220,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT: s_mov_b32 s11, s33
 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT: s_mov_b32 s12, s34
 ; GFX9-SDAG-NEXT: s_mov_b32 s8, 0
 ; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x2000
 ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -2243,7 +2276,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2
 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xe000
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s12
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s11
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2253,9 +2287,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT: s_mov_b32 s11, s33
 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT: s_mov_b32 s12, s34
 ; GFX9-GISEL-NEXT: s_mov_b32 s8, 0
 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x2000
 ; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -2307,7 +2343,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT: .LBB15_8: ; %bb.2
 ; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xe000
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s12
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s11
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2317,9 +2354,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s33
 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s34
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, 0
 ; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo
 ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x80
 ; GFX11-SDAG-NEXT: v_cmpx_ne_u32_e32 0, v0
 ; GFX11-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -2376,8 +2415,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2
 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff80
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s6
 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s5
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2386,9 +2425,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT: s_mov_b32 s5, s33
 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT: s_mov_b32 s6, s34
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, 0
 ; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo
 ; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x80
 ; GFX11-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v0
 ; GFX11-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -2446,7 +2487,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT: .LBB15_8: ; %bb.2
 ; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff80
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s6
 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s5
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -2492,7 +2534,7 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2522,7 +2564,7 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s33
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2551,12 +2593,11 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
@@ -2584,11 +2625,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %alloca = alloca i32, i16 %n, align 2, addrspace(5)
 store volatile i32 666, ptr addrspace(5) %alloca
@@ -2621,7 +2663,7 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2650,7 +2692,7 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s33
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2677,12 +2719,11 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
@@ -2708,11 +2749,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %alloca = alloca i32, i64 %n, align 2, addrspace(5)
 store volatile i32 666, ptr addrspace(5) %alloca
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir
index 831e246426ba7..b5a9f02711016 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir
@@ -2052,112 +2052,136 @@ machineFunctionInfo:
 body: |
 bb.0:
 ; GFX7-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc
-; GFX7: liveins: $sgpr4
+; GFX7: liveins: $sgpr4, $sgpr5
 ; GFX7-NEXT: {{ $}}
 ; GFX7-NEXT: $sgpr4 = frame-setup COPY $sgpr33
 ; GFX7-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
 ; GFX7-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
+; GFX7-NEXT: $sgpr5 = frame-setup COPY $sgpr34
+; GFX7-NEXT: $sgpr34 = frame-setup COPY $sgpr32
 ; GFX7-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
 ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
 ; GFX7-NEXT: $vcc_lo = S_MOV_B32 12288
 ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec
 ; GFX7-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec
-;
GFX7-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX7-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX7-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX7-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX8-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX8: liveins: $sgpr4 + ; GFX8: liveins: $sgpr4, $sgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX8-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX900-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX900: liveins: $sgpr4 + ; GFX900: liveins: $sgpr4, $sgpr5 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX900-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX900-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX900-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX900-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX900-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX900-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX900-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX900-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX90A-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX90A: liveins: $sgpr4 + ; GFX90A: liveins: $sgpr4, $sgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX90A-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX90A-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX90A-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX90A-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr32 = frame-destroy 
COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX90A-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX10-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX10: liveins: $sgpr4 + ; GFX10: liveins: $sgpr4, $sgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX10-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX10-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX10-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX10-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX10-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX10-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; GFX10-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX10-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX940: liveins: $sgpr4 + ; GFX940: liveins: $sgpr4, $sgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX940-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX940-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX11: liveins: $sgpr4 + ; GFX11: liveins: $sgpr4, $sgpr5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX11-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX11-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX11-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; GFX11-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX11-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX12-LABEL: name: 
v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX12: liveins: $sgpr4 + ; GFX12: liveins: $sgpr4, $sgpr5 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX12-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX12-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX12-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec - ; GFX12-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX12-NEXT: SI_RETURN implicit $vgpr0 renamable $vgpr0, renamable dead $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, %stack.1, 0, implicit $exec @@ -2180,115 +2204,139 @@ machineFunctionInfo: body: | bb.0: ; GFX7-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX7: liveins: $sgpr4 + ; GFX7: liveins: $sgpr4, $sgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX7-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX7-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX7-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX7-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX7-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX7-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX7-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX7-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX7-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX8-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX8: liveins: $sgpr4 + ; GFX8: liveins: $sgpr4, $sgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX8-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX900-LABEL: 
name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX900: liveins: $sgpr4 + ; GFX900: liveins: $sgpr4, $sgpr5 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX900-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX900-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX900-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX900-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX900-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX900-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX900-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX90A-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX90A: liveins: $sgpr4 + ; GFX90A: liveins: $sgpr4, $sgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX90A-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX90A-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX90A-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX90A-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX90A-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX10-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX10: liveins: $sgpr4 + ; GFX10: liveins: $sgpr4, $sgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX10-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX10-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX10-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX10-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX10-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX10-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec ; GFX10-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX10-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX10-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX940-LABEL: name: 
v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX940: liveins: $sgpr4 + ; GFX940: liveins: $sgpr4, $sgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX940-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX940-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX11: liveins: $sgpr4 + ; GFX11: liveins: $sgpr4, $sgpr5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX11-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX11-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX11-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX11-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX12-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX12: liveins: $sgpr4 + ; GFX12: liveins: $sgpr4, $sgpr5 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX12-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX12-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX12-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit 
$sgpr8_sgpr9 renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, %stack.1, 0, implicit $exec @@ -2311,112 +2359,136 @@ machineFunctionInfo: body: | bb.0: ; GFX7-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX7: liveins: $sgpr4 + ; GFX7: liveins: $sgpr4, $sgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX7-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX7-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX7-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX7-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX7-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX7-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX7-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX7-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX7-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX7-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX8-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX8: liveins: $sgpr4 + ; GFX8: liveins: $sgpr4, $sgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX8-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX900-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX900: liveins: $sgpr4 + ; GFX900: liveins: $sgpr4, $sgpr5 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX900-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX900-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX900-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX900-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX900-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX900-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX900-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; 
GFX900-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX90A-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX90A: liveins: $sgpr4 + ; GFX90A: liveins: $sgpr4, $sgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX90A-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX90A-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX90A-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX90A-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX90A-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX10-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX10: liveins: $sgpr4 + ; GFX10: liveins: $sgpr4, $sgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX10-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX10-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX10-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX10-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX10-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX10-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; GFX10-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX10-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX940: liveins: $sgpr4 + ; GFX940: liveins: $sgpr4, $sgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX940-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX11: liveins: $sgpr4 + ; GFX11: liveins: $sgpr4, 
$sgpr5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX11-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX11-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX11-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; GFX11-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX11-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX12-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX12: liveins: $sgpr4 + ; GFX12: liveins: $sgpr4, $sgpr5 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX12-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX12-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX12-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec - ; GFX12-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX12-NEXT: SI_RETURN implicit $vgpr0 renamable $vgpr0, renamable dead $vcc = V_ADD_CO_U32_e64 64, %stack.1, 0, implicit $exec @@ -2439,115 +2511,139 @@ machineFunctionInfo: body: | bb.0: ; GFX7-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX7: liveins: $sgpr4 + ; GFX7: liveins: $sgpr4, $sgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX7-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX7-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX7-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX7-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX7-NEXT: $sgpr6 = S_MOV_B32 12288 ; GFX7-NEXT: $vgpr1, dead $sgpr6_sgpr7 = V_ADD_CO_U32_e64 killed $sgpr6, killed $vgpr1, 0, implicit $exec ; GFX7-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX7-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX7-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX7-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX8-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX8: liveins: $sgpr4 + ; GFX8: liveins: $sgpr4, $sgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup 
S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $sgpr6 = S_MOV_B32 12288 ; GFX8-NEXT: $vgpr1, dead $sgpr6_sgpr7 = V_ADD_CO_U32_e64 killed $sgpr6, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX8-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX900-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX900: liveins: $sgpr4 + ; GFX900: liveins: $sgpr4, $sgpr5 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX900-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX900-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX900-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX900-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX900-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX900-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX900-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX90A-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX90A: liveins: $sgpr4 + ; GFX90A: liveins: $sgpr4, $sgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX90A-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX90A-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX90A-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX90A-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX90A-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX10-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX10: liveins: $sgpr4 + ; GFX10: liveins: $sgpr4, $sgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX10-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX10-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX10-NEXT: $sgpr5 = frame-setup 
COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX10-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX10-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX10-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec ; GFX10-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX10-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX10-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX940: liveins: $sgpr4 + ; GFX940: liveins: $sgpr4, $sgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX940-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX11: liveins: $sgpr4 + ; GFX11: liveins: $sgpr4, $sgpr5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX11-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX11-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX11-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX11-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX12-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX12: liveins: $sgpr4 + ; GFX12: liveins: $sgpr4, $sgpr5 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX12-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX12-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; 
GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX12-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, %stack.1, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir index de198941b565e..b7a5cf963138f 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir @@ -1708,42 +1708,51 @@ machineFunctionInfo: body: | bb.0: ; MUBUF-LABEL: name: v_add_u32_e64_imm_fi_vop3_literal_error - ; MUBUF: liveins: $sgpr4 + ; MUBUF: liveins: $sgpr4, $sgpr5 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; MUBUF-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; MUBUF-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; MUBUF-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; MUBUF-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; MUBUF-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; MUBUF-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 ; ; MUBUFW32-LABEL: name: v_add_u32_e64_imm_fi_vop3_literal_error - ; MUBUFW32: liveins: $sgpr4 + ; MUBUFW32: liveins: $sgpr4, $sgpr5 ; MUBUFW32-NEXT: {{ $}} ; MUBUFW32-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; MUBUFW32-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 262112, implicit-def $scc ; MUBUFW32-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294705152, implicit-def dead $scc + ; MUBUFW32-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; MUBUFW32-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUFW32-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1048576, implicit-def dead $scc ; MUBUFW32-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 5, $sgpr33, implicit $exec ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; MUBUFW32-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1048576, implicit-def dead $scc + ; MUBUFW32-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; MUBUFW32-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; MUBUFW32-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 ; ; FLATSCRW64-LABEL: name: v_add_u32_e64_imm_fi_vop3_literal_error - ; FLATSCRW64: liveins: $sgpr4 + ; FLATSCRW64: liveins: $sgpr4, $sgpr5 ; FLATSCRW64-NEXT: {{ $}} ; FLATSCRW64-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; FLATSCRW64-NEXT: $sgpr33 = frame-setup 
S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; FLATSCRW64-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; FLATSCRW64-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; FLATSCRW64-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCRW64-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; FLATSCRW64-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; FLATSCRW64-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; FLATSCRW64-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; FLATSCRW64-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; FLATSCRW64-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; FLATSCRW64-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 renamable $vgpr0 = V_ADD_U32_e64 64, %stack.1, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll index 8e0750195b3b4..c4063aecb6ed7 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll @@ -18,8 +18,10 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 { ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: s_addk_i32 s32, 0x3000 ; GCN-NEXT: v_writelane_b32 v42, s16, 2 +; GCN-NEXT: v_writelane_b32 v42, s34, 3 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0x3000 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 @@ -55,11 +57,12 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 { ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: v_readlane_b32 s31, v42, 1 ; GCN-NEXT: v_readlane_b32 s30, v42, 0 +; GCN-NEXT: s_mov_b32 s32, s34 ; GCN-NEXT: v_readlane_b32 s4, v42, 2 +; GCN-NEXT: v_readlane_b32 s34, v42, 3 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xd000 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll index 3922b5404d778..6684262f5c976 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -28,11 +28,11 @@ define void @callee_with_stack_and_call() #0 { ; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 +; SPILL-TO-VGPR-NEXT: s_mov_b32 s32, s33 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v40, 2 ; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1 ; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7] -; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00 ; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s4 ; SPILL-TO-VGPR-NEXT: 
s_waitcnt vmcnt(0) ; SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] @@ -87,7 +87,7 @@ define void @callee_with_stack_and_call() #0 { ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xf800 +; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s32, s33 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s4, v0 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 6fd2c5a1267fb..32f255df82499 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -1680,11 +1680,11 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1710,11 +1710,11 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2092,11 +2092,11 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2134,11 +2134,11 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll index bb0e116cb4d32..1ad365df2e8a8 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -74,10 +74,10 @@ define amdgpu_gfx void @gfx_func() { ; SDAG-NEXT: v_readlane_b32 s6, v40, 2 ; SDAG-NEXT: 
v_readlane_b32 s5, v40, 1 ; SDAG-NEXT: v_readlane_b32 s4, v40, 0 +; SDAG-NEXT: s_mov_b32 s32, s33 ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; SDAG-NEXT: s_mov_b64 exec, s[34:35] -; SDAG-NEXT: s_addk_i32 s32, 0xfc00 ; SDAG-NEXT: s_mov_b32 s33, s36 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -151,10 +151,10 @@ define amdgpu_gfx void @gfx_func() { ; GISEL-NEXT: v_readlane_b32 s6, v40, 2 ; GISEL-NEXT: v_readlane_b32 s5, v40, 1 ; GISEL-NEXT: v_readlane_b32 s4, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[34:35] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s36 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 5ccbc85f46dd4..2e3ca34af4c74 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -142,11 +142,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -171,12 +171,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -201,11 +201,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -230,12 +230,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, 
s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -265,11 +265,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -296,12 +296,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -327,11 +327,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -358,12 +358,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -394,11 +394,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -425,12 +425,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -456,11 +456,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -487,12 +487,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -520,11 +520,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -548,12 +548,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -577,11 +577,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -605,12 +605,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -638,11 +638,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -667,12 +667,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -697,11 +697,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -726,12 +726,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -760,11 +760,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -789,12 +789,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -819,11 +819,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -848,12 +848,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -881,11 +881,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -909,12 +909,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -938,11 +938,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -966,12 +966,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -999,11 +999,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1028,12 +1028,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1058,11 +1058,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1087,12 +1087,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1121,11 +1121,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1150,12 +1150,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1180,11 +1180,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1209,12 +1209,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1242,11 +1242,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1270,12 +1270,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1299,11 +1299,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1327,12 +1327,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1360,11 +1360,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1389,12 +1389,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1418,11 +1418,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1447,12 +1447,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1481,11 +1481,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1511,12 +1511,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1542,11 +1542,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1572,12 +1572,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1608,11 +1608,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1639,12 +1639,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1669,11 +1669,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1700,12 +1700,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1736,11 +1736,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1768,12 +1768,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1799,11 +1799,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1831,12 +1831,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1872,11 +1872,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1906,12 +1906,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1938,11 +1938,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1972,12 +1972,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2006,11 +2006,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2034,12 +2034,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2063,11 +2063,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2091,12 +2091,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2123,11 +2123,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2151,12 +2151,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2180,11 +2180,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2208,12 +2208,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2241,11 +2241,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2270,12 +2270,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2299,11 +2299,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2328,12 +2328,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2362,11 +2362,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2392,12 +2392,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2422,11 +2422,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2452,12 +2452,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2488,11 +2488,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2520,12 +2520,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2551,11 +2551,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2583,12 +2583,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2616,11 +2616,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2645,12 +2645,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2674,11 +2674,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2703,12 +2703,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2738,11 +2738,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2769,12 +2769,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2799,11 +2799,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2830,12 +2830,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2867,11 +2867,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2900,12 +2900,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2931,11 +2931,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2964,12 +2964,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -3001,11 +3001,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3034,12 +3034,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3068,11 +3068,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3101,12 +3101,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -3139,11 +3139,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3172,12 +3172,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3205,11 +3205,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3238,12 +3238,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -3277,11 +3277,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3311,12 +3311,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3345,11 +3345,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3379,12 +3379,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -3420,11 +3420,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3456,12 +3456,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3492,11 +3492,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3528,12 +3528,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -3572,11 +3572,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3611,12 +3611,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3649,11 +3649,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3688,12 +3688,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -3764,11 +3764,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3836,12 +3836,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3902,11 +3902,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3974,12 +3974,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -4016,11 +4016,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4052,12 +4052,12 @@ define amdgpu_gfx void
@test_call_external_void_func_i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4089,11 +4089,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4125,12 +4125,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4173,11 +4173,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4209,6 +4209,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[40:41], v0, off @@ -4219,7 +4220,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4253,8 +4253,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: 
v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: global_store_b16 v[40:41], v0, off ; GFX11-NEXT: s_clause 0x1 @@ -4263,7 +4263,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4295,6 +4294,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_short v[40:41], v0, off @@ -4305,7 +4305,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4351,11 +4350,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4390,6 +4389,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: global_store_byte v[3:4], v2, off ; GFX10-NEXT: global_store_short v[40:41], v0, off @@ -4400,7 +4400,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4437,6 +4436,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b8 v[0:1], v2, off @@ -4447,7 +4447,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; 
GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4482,6 +4481,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: global_store_byte v[3:4], v2, off ; GFX10-SCRATCH-NEXT: global_store_short v[40:41], v0, off @@ -4492,7 +4492,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4539,11 +4538,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4577,9 +4576,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-NEXT: v_readlane_b32 s34, v42, 2 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[40:41], v0, off ; GFX10-NEXT: s_clause 0x1 @@ -4589,7 +4589,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4627,9 +4626,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 
v0, 0xffff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4641,7 +4641,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4675,9 +4674,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 @@ -4687,7 +4687,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4739,11 +4738,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4779,11 +4778,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-NEXT: v_readlane_b32 s34, v42, 2 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v0, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_byte v[0:1], v4, off ; GFX10-NEXT: global_store_dword v[40:41], v2, off @@ -4794,7 +4794,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4834,9 +4833,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX11-NEXT: v_dual_mov_b32 v0, 4 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -4851,7 +4851,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4887,11 +4886,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v4, off ; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v2, off @@ -4902,7 +4902,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4959,11 +4958,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5008,8 +5007,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-NEXT: v_readlane_b32 s34, v42, 2 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_or_b32_sdwa v1, v4, v5 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dwordx2 v[40:41], v[0:1], off ; GFX10-NEXT: s_clause 0x1 @@ -5019,7 +5019,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5072,8 +5071,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX11-NEXT: global_store_b64 v[40:41], v[0:1], off @@ -5083,7 +5083,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5128,8 +5127,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_dwordx2 v[40:41], v[0:1], off ; GFX10-SCRATCH-NEXT: s_clause 0x1 @@ -5139,7 +5139,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5263,11 +5262,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v44, 1 ; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v44, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5386,12 +5385,12 @@ define amdgpu_gfx void 
@test_call_external_void_func_v32i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v44, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5537,11 +5536,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v44, 1 ; GFX11-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v44, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5660,12 +5659,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v44, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5696,11 +5695,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5724,12 +5723,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5753,11 +5752,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; 
GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5781,12 +5780,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5814,11 +5813,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5842,12 +5841,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5871,11 +5870,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5899,12 +5898,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5932,11 +5931,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, 
v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5960,12 +5959,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5989,11 +5988,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6017,12 +6016,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6051,11 +6050,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6080,12 +6079,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6109,11 
+6108,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6138,12 +6137,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6171,11 +6170,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6200,12 +6199,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6230,11 +6229,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6259,12 +6258,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; 
GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6291,11 +6290,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6319,12 +6318,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6348,11 +6347,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6376,12 +6375,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6410,11 +6409,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6439,12 +6438,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, 
v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6469,11 +6468,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6498,12 +6497,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6530,11 +6529,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6558,12 +6557,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6587,11 +6586,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6615,12 +6614,12 @@ define amdgpu_gfx void 
@test_call_external_void_func_v2f16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6648,11 +6647,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6676,12 +6675,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6705,11 +6704,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6733,12 +6732,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6767,11 +6766,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 
4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6796,12 +6795,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6825,11 +6824,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6854,12 +6853,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6888,11 +6887,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6918,12 +6917,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6948,11 +6947,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 
s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6978,12 +6977,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7013,11 +7012,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7044,12 +7043,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7074,11 +7073,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7105,12 +7104,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7137,11 +7136,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7165,12 +7164,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7194,11 +7193,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7222,12 +7221,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7258,11 +7257,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7289,12 +7288,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7319,11 +7318,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7350,12 +7349,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7386,11 +7385,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7418,12 +7417,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7449,11 +7448,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7481,12 +7480,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7517,11 +7516,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7550,12 +7549,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7584,11 +7583,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7617,12 +7616,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7658,11 +7657,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7693,12 +7692,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7725,11 +7724,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7760,12 +7759,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7798,11 +7797,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7833,12 +7832,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7869,11 +7868,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7904,12 +7903,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7949,11 +7948,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7988,12 +7987,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -8028,11 +8027,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8067,12 +8066,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -8115,11 +8114,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8157,12 +8156,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -8199,11 +8198,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8241,12 +8240,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -8284,11 +8283,11 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8321,12 +8320,12 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4
 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -8358,11 +8357,11 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-NEXT: v_readlane_b32 s31, v42, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8395,12 +8394,12 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -8432,11 +8431,11 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8465,12 +8464,12 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -8499,11 +8498,11 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8532,12 +8531,12 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -8570,11 +8569,11 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8602,12 +8601,12 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -8635,11 +8634,11 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8667,12 +8666,12 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -8712,6 +8711,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -8721,7 +8721,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xf800
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8754,6 +8753,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -8764,7 +8764,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -8795,6 +8794,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-NEXT: scratch_load_b32 v1, off, s33 offset:12
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
@@ -8804,7 +8804,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8837,6 +8836,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v1, off, s33 offset:12
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v0, off
@@ -8847,7 +8847,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:16 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -8909,11 +8908,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8960,12 +8959,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9008,11 +9007,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9059,12 +9058,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -9158,10 +9157,10 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xf800
 ; GFX9-NEXT: s_mov_b32 s33, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9251,11 +9250,11 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX10-NEXT: s_mov_b32 s33, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9341,10 +9340,10 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX11-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT: s_mov_b32 s33, s4
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9431,11 +9430,11 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:24 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s4
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -9466,11 +9465,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9495,12 +9494,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9525,11 +9524,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9554,12 +9553,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -9588,11 +9587,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9618,12 +9617,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9649,11 +9648,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9679,12 +9678,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -9713,11 +9712,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9743,12 +9742,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9774,11 +9773,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9804,12 +9803,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -9838,11 +9837,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9868,12 +9867,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9899,11 +9898,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9929,12 +9928,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -9966,11 +9965,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9999,12 +9998,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10033,11 +10032,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10066,12 +10065,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -10107,11 +10106,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10144,12 +10143,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10182,11 +10181,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10219,12 +10218,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -10263,11 +10262,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10302,12 +10301,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10342,11 +10341,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10381,12 +10380,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -10428,11 +10427,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 8
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10471,12 +10470,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 8
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10515,11 +10514,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 8
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10558,12 +10557,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -10614,11 +10613,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 10
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10663,12 +10662,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 10
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10713,11 +10712,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 10
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10762,12 +10761,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -10798,11 +10797,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10828,12 +10827,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10859,11 +10858,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10889,12 +10888,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -10923,11 +10922,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10953,12 +10952,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10984,11 +10983,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11014,12 +11013,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -11051,11 +11050,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -11084,12 +11083,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -11118,11 +11117,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11151,12 +11150,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -11191,11 +11190,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 5
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -11227,12 +11226,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 5
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -11264,11 +11263,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 5
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11300,12 +11299,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -11346,11 +11345,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 7
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -11388,12 +11387,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 7
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -11431,11 +11430,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 7
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11473,12 +11472,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ;
GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11510,11 +11509,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11543,12 +11542,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -11577,11 +11576,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11610,12 +11609,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11653,11 +11652,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11692,12 +11691,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: 
v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -11732,11 +11731,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11771,12 +11770,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11820,11 +11819,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 8 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11865,12 +11864,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 8 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -11911,11 +11910,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 8 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11956,12 +11955,12 @@ 
define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11990,11 +11989,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12020,12 +12019,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12051,11 +12050,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12081,12 +12080,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12118,11 +12117,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12150,12 +12149,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12183,11 +12182,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12215,12 +12214,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12252,11 +12251,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12284,12 +12283,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12317,11 +12316,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: 
v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12349,12 +12348,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12387,11 +12386,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12420,12 +12419,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12454,11 +12453,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12487,12 +12486,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; 
GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12524,11 +12523,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12557,12 +12556,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12591,11 +12590,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12624,12 +12623,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12660,11 +12659,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12692,12 +12691,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, 
off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12725,11 +12724,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12757,12 +12756,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12795,11 +12794,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12828,12 +12827,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12862,11 +12861,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12895,12 +12894,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12929,11 +12928,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12959,12 +12958,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12990,11 +12989,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13020,12 +13019,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13057,11 +13056,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13089,12 +13088,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13122,11 +13121,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13154,12 +13153,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13192,11 +13191,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13225,12 +13224,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13259,11 +13258,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: 
scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13292,12 +13291,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13332,11 +13331,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13368,12 +13367,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 5 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13405,11 +13404,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 5 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13441,12 +13440,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13484,11 +13483,11 @@ define amdgpu_gfx void 
@test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13523,12 +13522,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13563,11 +13562,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13602,12 +13601,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13642,11 +13641,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13678,12 +13677,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; 
GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13715,11 +13714,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13751,12 +13750,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13795,11 +13794,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13834,12 +13833,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13874,11 +13873,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13913,12 +13912,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13959,11 +13958,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 7 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -14001,12 +14000,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 7 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -14044,11 +14043,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 7 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -14086,12 +14085,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -14136,11 +14135,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -14182,12 +14181,12 @@ define amdgpu_gfx void 
@test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 10
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -14229,11 +14228,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 10
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -14275,12 +14274,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -14332,11 +14331,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 10
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -14383,12 +14382,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 10
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -14435,11 +14434,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 10
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -14486,12 +14485,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -14552,11 +14551,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 18
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -14614,12 +14613,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 18
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -14677,11 +14676,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 18
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -14739,12 +14738,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 18
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -14851,11 +14850,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 28
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -14958,12 +14957,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 28
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -15061,11 +15060,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 28
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -15165,12 +15164,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 28
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -15282,11 +15281,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 28
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -15394,12 +15393,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 28
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -15501,11 +15500,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 28
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -15611,12 +15610,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 28
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -15651,11 +15650,11 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -15685,12 +15684,12 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -15715,11 +15714,11 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -15745,12 +15744,12 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -15817,11 +15816,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -15884,12 +15883,12 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -15931,11 +15930,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -15995,12 +15994,12 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -16087,11 +16086,11 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -16162,12 +16161,12 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -16213,11 +16212,11 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -16283,12 +16282,12 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -16371,11 +16370,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -16446,12 +16445,12 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -16503,11 +16502,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -16573,12 +16572,12 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -16613,11 +16612,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -16640,12 +16639,12 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -16668,11 +16667,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -16695,12 +16694,12 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -16727,11 +16726,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -16754,12 +16753,12 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -16782,11 +16781,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -16809,12 +16808,12 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -16841,11 +16840,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -16868,12 +16867,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -16896,11 +16895,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -16923,12 +16922,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -16955,11 +16954,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -16982,12 +16981,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17010,11 +17009,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17037,12 +17036,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17069,11 +17068,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17096,12 +17095,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17124,11 +17123,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17151,12 +17150,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17183,11 +17182,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17210,12 +17209,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17238,11 +17237,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17265,12 +17264,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17297,11 +17296,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17324,12 +17323,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17352,11 +17351,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17379,12 +17378,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17411,11 +17410,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17438,12 +17437,12 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17466,11 +17465,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17493,12 +17492,12 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17525,11 +17524,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17552,12 +17551,12 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17580,11 +17579,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17607,12 +17606,12 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17639,11 +17638,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17666,12 +17665,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17694,11 +17693,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17721,12 +17720,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17753,11 +17752,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17780,12 +17779,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17808,11 +17807,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17835,12 +17834,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17867,11 +17866,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17894,12 +17893,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17922,11 +17921,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17949,12 +17948,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17981,11 +17980,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -18008,12 +18007,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -18036,11 +18035,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -18063,12 +18062,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -18095,11 +18094,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -18122,12 +18121,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -18150,11 +18149,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -18177,12 +18176,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index a14e3d5673f82..4afc2fc972a28 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -30,11 +30,11 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -64,12 +64,12 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -99,11 +99,11 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -227,11 +227,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -264,12 +264,12 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -301,11 +301,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -343,11 +343,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v41, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v41, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -380,12 +380,12 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v41, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -418,11 +418,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v41, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v41, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -461,11 +461,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -498,12 +498,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -536,11 +536,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -578,11 +578,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -615,12 +615,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -653,11 +653,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -693,11 +693,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v41, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v41, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -728,12 +728,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v41, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -763,11 +763,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v41, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v41, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -911,11 +911,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -938,12 +938,12 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -966,11 +966,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -996,11 +996,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1023,12 +1023,12 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1051,11 +1051,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1090,11 +1090,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1126,12 +1126,12 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1163,11 +1163,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1213,11 +1213,11 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT: v_readlane_b32 s31, v41, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v41, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v41, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v41, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:
buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1258,12 +1258,12 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX10-NEXT: v_readlane_b32 s31, v41, 2 ; GFX10-NEXT: v_readlane_b32 s30, v41, 1 ; GFX10-NEXT: v_readlane_b32 s4, v41, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v41, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1303,11 +1303,11 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX11-NEXT: v_readlane_b32 s31, v41, 2 ; GFX11-NEXT: v_readlane_b32 s30, v41, 1 ; GFX11-NEXT: v_readlane_b32 s4, v41, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v41, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index c3ab9c23d1950..6384fdba7a45a 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -36,10 +36,10 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -61,11 +61,11 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -87,10 +87,10 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -132,10 +132,10 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; 
GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -157,11 +157,11 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -183,10 +183,10 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -228,10 +228,10 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -253,11 +253,11 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -279,10 +279,10 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -332,10 +332,10 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -357,11 +357,11 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: 
v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -383,10 +383,10 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -746,10 +746,10 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v100, 1 ; GFX9-NEXT: v_readlane_b32 s30, v100, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xdc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -836,11 +836,11 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:124 ; GFX10-NEXT: v_readlane_b32 s31, v100, 1 ; GFX10-NEXT: v_readlane_b32 s30, v100, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xee00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -927,10 +927,10 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:124 ; GFX11-NEXT: v_readlane_b32 s31, v100, 1 ; GFX11-NEXT: v_readlane_b32 s30, v100, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v100, off, s33 offset:128 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xff70 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2130,61 +2130,67 @@ define amdgpu_gfx void @call_512xi32() #0 { ; GFX9-LABEL: call_512xi32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, s33 +; GFX9-NEXT: s_mov_b32 s35, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x1ffc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xfffe0000 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-NEXT: s_mov_b32 s35, return_512xi32@abs32@hi -; GFX9-NEXT: s_mov_b32 s34, return_512xi32@abs32@lo +; GFX9-NEXT: s_mov_b32 s37, return_512xi32@abs32@hi +; GFX9-NEXT: s_mov_b32 s36, return_512xi32@abs32@lo ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX9-NEXT: s_mov_b32 s38, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: 
s_add_i32 s32, s32, 0x60000 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: s_mov_b32 s34, s38 +; GFX9-NEXT: s_xor_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_add_i32 s32, s32, 0xfffa0000 -; GFX9-NEXT: s_mov_b32 s33, s36 +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: s_mov_b32 s33, s35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: call_512xi32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s36, s33 +; GFX10-NEXT: s_mov_b32 s35, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0xffe0 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffff0000 -; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s36, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s36 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: s_mov_b32 s35, return_512xi32@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, return_512xi32@abs32@lo +; GFX10-NEXT: s_mov_b32 s37, return_512xi32@abs32@hi +; GFX10-NEXT: s_mov_b32 s36, return_512xi32@abs32@lo +; GFX10-NEXT: s_mov_b32 s38, s34 +; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_add_i32 s32, s32, 0x30000 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 +; GFX10-NEXT: s_mov_b32 s32, s34 +; GFX10-NEXT: s_mov_b32 s34, s38 +; GFX10-NEXT: s_xor_saveexec_b32 s36, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_add_i32 s32, s32, 0xfffd0000 -; GFX10-NEXT: s_mov_b32 s33, s36 +; GFX10-NEXT: s_mov_b32 exec_lo, s36 +; GFX10-NEXT: s_mov_b32 s33, s35 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: call_512xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s34, s33 +; GFX11-NEXT: s_mov_b32 s35, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 0x7ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffff800 @@ -2195,17 +2201,20 @@ define amdgpu_gfx void @call_512xi32() #0 { ; GFX11-NEXT: v_mov_b32_e32 v0, s33 ; GFX11-NEXT: s_mov_b32 s1, return_512xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, return_512xi32@abs32@lo +; GFX11-NEXT: s_mov_b32 s36, s34 +; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0x1800 ; GFX11-NEXT: v_writelane_b32 v5, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v5, 1 ; GFX11-NEXT: v_readlane_b32 s30, v5, 0 +; GFX11-NEXT: s_mov_b32 s32, s34 +; GFX11-NEXT: s_mov_b32 s34, s36 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v5, off, s33 offset:2048 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xe800 -; 
GFX11-NEXT: s_mov_b32 s33, s34 +; GFX11-NEXT: s_mov_b32 s33, s35 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -2619,12 +2628,14 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-LABEL: call_72xi32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, s33 +; GFX9-NEXT: s_mov_b32 s35, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x7fc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xffff8000 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: s_mov_b32 s38, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill @@ -2685,8 +2696,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_writelane_b32 v63, s30, 0 -; GFX9-NEXT: s_mov_b32 s35, return_72xi32@abs32@hi -; GFX9-NEXT: s_mov_b32 s34, return_72xi32@abs32@lo +; GFX9-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi +; GFX9-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo ; GFX9-NEXT: v_add_u32_e32 v0, 0x200, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2720,7 +2731,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 @@ -2859,7 +2870,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -2877,25 +2888,28 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: s_mov_b32 s34, s38 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_add_i32 s32, s32, 0xfffd8000 -; GFX9-NEXT: s_mov_b32 s33, s36 +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: s_mov_b32 s33, s35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: call_72xi32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s36, s33 +; GFX10-NEXT: s_mov_b32 s35, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0x3fe0 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: s_or_saveexec_b32 s36, -1 ; GFX10-NEXT: 
buffer_store_dword v63, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s36 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s38, s34 +; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill @@ -2912,8 +2926,6 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v63, s30, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2956,11 +2968,13 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: v_writelane_b32 v63, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -2987,10 +3001,10 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v29, 0 ; GFX10-NEXT: v_mov_b32_e32 v30, 0 ; GFX10-NEXT: v_mov_b32_e32 v31, 0 -; GFX10-NEXT: s_mov_b32 s35, return_72xi32@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, return_72xi32@abs32@lo +; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi +; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo ; GFX10-NEXT: v_writelane_b32 v63, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_clause 0x28 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 @@ -3133,7 +3147,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_clause 0xe ; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 @@ -3152,19 +3166,20 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; GFX10-NEXT: v_readlane_b32 s31, v63, 1 ; GFX10-NEXT: v_readlane_b32 s30, v63, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: s_mov_b32 s32, s34 +; GFX10-NEXT: s_mov_b32 s34, s38 +; GFX10-NEXT: s_or_saveexec_b32 s36, -1 ; GFX10-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_add_i32 s32, s32, 0xfffec000 -; GFX10-NEXT: s_mov_b32 s33, s36 +; GFX10-NEXT: s_mov_b32 exec_lo, s36 +; GFX10-NEXT: s_mov_b32 s33, s35 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: call_72xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s34, s33 +; GFX11-NEXT: s_mov_b32 s35, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 @@ -3178,6 +3193,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s36, s34 +; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 ; GFX11-NEXT: s_clause 0xb ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 @@ -3347,11 +3364,12 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:44 ; GFX11-NEXT: v_readlane_b32 s31, v60, 1 ; GFX11-NEXT: v_readlane_b32 s30, v60, 0 +; GFX11-NEXT: s_mov_b32 s32, s34 +; GFX11-NEXT: s_mov_b32 s34, s36 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1600 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xf600 -; GFX11-NEXT: s_mov_b32 s33, s34 +; GFX11-NEXT: s_mov_b32 s33, s35 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/global-alias.ll b/llvm/test/CodeGen/AMDGPU/global-alias.ll index 334e6e2b617e0..d8df20eb69452 100644 --- a/llvm/test/CodeGen/AMDGPU/global-alias.ll +++ b/llvm/test/CodeGen/AMDGPU/global-alias.ll @@ -37,11 +37,11 @@ define void @bar() { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index da8aa54469835..55da485b91f67 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -193,11 +193,11 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -277,11 +277,11 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -368,11 +368,11 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr 
%fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -453,11 +453,11 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -543,11 +543,11 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -629,11 +629,11 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -729,11 +729,11 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 20 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -824,11 +824,11 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 20 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -930,10 +930,10 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 
s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1024,10 +1024,10 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s5 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1126,10 +1126,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s31, v41, 1 ; GCN-NEXT: v_readlane_b32 s30, v41, 0 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1224,10 +1224,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s31, v41, 1 ; GISEL-NEXT: v_readlane_b32 s30, v41, 0 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1328,10 +1328,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1424,10 +1424,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1522,10 +1522,10 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: 
s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1615,10 +1615,10 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 5c5a769178dd9..ea3d57d127151 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -36,10 +36,10 @@ define void @f0() { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v4, 1 ; GFX11-NEXT: v_readlane_b32 s30, v4, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll index eef51acc4e12e..1f518386c63d5 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll @@ -41,11 +41,11 @@ define fastcc i32 @foo() { ; CHECK-NEXT: bb.2.DummyReturnBlock: ; CHECK-NEXT: $sgpr31 = V_READLANE_B32 $vgpr40, 1 ; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0 + ; CHECK-NEXT: $sgpr32 = S_MOV_B32 $sgpr33 ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr40, 2 ; CHECK-NEXT: $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr5 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADDK_I32 $sgpr32, -512, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = S_MOV_B32 killed $sgpr4 ; CHECK-NEXT: S_WAITCNT 16240 ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit undef $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 29fbb0bb1c6c9..0edc7cb01887b 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -112,10 +112,12 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 ; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 ; MUBUF-NEXT: v_add_u32_e32 v3, 0x3000, v3 +; MUBUF-NEXT: s_mov_b32 s6, s34 ; MUBUF-NEXT: v_add_u32_e32 v2, 64, v3 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0 ; MUBUF-NEXT: v_mov_b32_e32 v4, 0x2000 ; MUBUF-NEXT: s_mov_b32 s4, 0 +; MUBUF-NEXT: s_mov_b32 s34, s32 ; MUBUF-NEXT: s_add_i32 s32, s32, 0x200000 ; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], s33 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -141,7 +143,8 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; 
MUBUF-NEXT: s_add_i32 s32, s32, 0xffe00000 +; MUBUF-NEXT: s_mov_b32 s32, s34 +; MUBUF-NEXT: s_mov_b32 s34, s6 ; MUBUF-NEXT: s_mov_b32 s33, s5 ; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 ; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc @@ -155,6 +158,8 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_mov_b32 s2, s33 ; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff ; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 +; FLATSCR-NEXT: s_mov_b32 s3, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s32 ; FLATSCR-NEXT: s_add_i32 s32, s32, 0x8000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x2000 @@ -179,7 +184,8 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_addk_i32 s32, 0x8000 +; FLATSCR-NEXT: s_mov_b32 s32, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s3 ; FLATSCR-NEXT: s_mov_b32 s33, s2 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index b77c3a9bb532b..8e436b327cda1 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -439,6 +439,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 @@ -455,7 +456,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_1-NEXT: s_mov_b32 s33, s5 ; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] @@ -473,6 +473,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 @@ -488,7 +489,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80880 ; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_3-NEXT: s_mov_b32 s33, s5 ; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] @@ -513,7 +513,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_bitcmp1_b32 s0, 0 ; GFX11-NEXT: s_bitset0_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_mov_b32 s59, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc @@ -523,7 +523,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX11-NEXT: s_add_i32 s2, s33, 0x4044 ; GFX11-NEXT: scratch_load_b32 v1, off, s2 ; 4-byte Folded Reload ; 
GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xbf80 ; GFX11-NEXT: s_mov_b32 s33, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -558,11 +557,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: v_readlane_b32 s59, v1, 0 +; GFX12-NEXT: s_mov_b32 s32, s33 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -593,11 +592,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_readlane_b32 s59, v1, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s7, s33, 0x101100 ; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -626,11 +625,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s59, v1, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: s_add_i32 s7, s33, 0x101100 ; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX900-NEXT: s_mov_b32 s33, s6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -660,11 +659,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX940-NEXT: ; use s59, scc ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_readlane_b32 s59, v1, 0 +; GFX940-NEXT: s_mov_b32 s32, s33 ; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX940-NEXT: s_add_i32 s3, s33, 0x4044 ; GFX940-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload ; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_addk_i32 s32, 0xbf80 ; GFX940-NEXT: s_mov_b32 s33, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1027,6 +1026,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1 ; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1 ; GFX10_1-NEXT: ;;#ASMSTART @@ -1038,7 +1038,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_1-NEXT: s_mov_b32 s33, s5 ; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] @@ -1056,6 +1055,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1 ; GFX10_3-NEXT: 
v_readfirstlane_b32 s59, v1 ; GFX10_3-NEXT: ;;#ASMSTART @@ -1066,7 +1066,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 ; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_3-NEXT: s_mov_b32 s33, s5 ; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] @@ -1084,9 +1083,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: v_writelane_b32 v0, s59, 0 ; GFX11-NEXT: s_addc_u32 s0, s33, 64 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_bitcmp1_b32 s0, 0 ; GFX11-NEXT: s_bitset0_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s59, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc @@ -1096,7 +1096,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 ; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xbf80 ; GFX11-NEXT: s_mov_b32 s33, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1122,11 +1121,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 +; GFX12-NEXT: s_mov_b32 s32, s33 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1152,11 +1151,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_readlane_b32 s59, v0, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s7, s33, 0x101000 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1180,11 +1179,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s59, v0, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: s_add_i32 s7, s33, 0x101000 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX900-NEXT: s_mov_b32 s33, s6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1209,11 +1208,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX940-NEXT: ; use s59, scc ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_readlane_b32 s59, v0, 0 +; GFX940-NEXT: s_mov_b32 s32, s33 ; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX940-NEXT: s_add_i32 s3, s33, 0x4040 ; GFX940-NEXT: scratch_load_dword v0, off, s3 ; 4-byte Folded Reload ; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; 
GFX940-NEXT: s_addk_i32 s32, 0xbf80 ; GFX940-NEXT: s_mov_b32 s33, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1236,6 +1235,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_1-NEXT: s_lshr_b32 s59, s33, 5 +; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: s_add_i32 s59, s59, 64 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59 @@ -1246,7 +1246,6 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s5 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_1-NEXT: s_mov_b32 s33, s4 ; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] @@ -1263,6 +1262,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_3-NEXT: s_lshr_b32 s59, s33, 5 +; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: s_add_i32 s59, s59, 64 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59 @@ -1272,7 +1272,6 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 ; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_3-NEXT: s_mov_b32 exec_lo, s5 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_3-NEXT: s_mov_b32 s33, s4 ; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] @@ -1289,7 +1288,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX11-NEXT: v_writelane_b32 v0, s59, 0 ; GFX11-NEXT: s_addk_i32 s32, 0x4080 ; GFX11-NEXT: s_add_i32 s1, s33, 64 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_mov_b32 s59, s1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59 @@ -1299,7 +1298,6 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 ; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xbf80 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1318,18 +1316,17 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_writelane_b32 v0, s59, 0 -; GFX12-NEXT: s_mov_b32 s59, s33 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 +; GFX12-NEXT: s_mov_b32 s59, s33 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s32, s33 ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1352,11 +1349,11 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX8-NEXT: ; use s59 ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_readlane_b32 s59, v0, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[6:7], 
-1 ; GFX8-NEXT: s_add_i32 s5, s33, 0x101000 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX8-NEXT: s_mov_b32 s33, s4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1378,11 +1375,11 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX900-NEXT: ; use s59 ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s59, v0, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX900-NEXT: s_add_i32 s5, s33, 0x101000 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[6:7] -; GFX900-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX900-NEXT: s_mov_b32 s33, s4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1404,11 +1401,11 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX940-NEXT: ; use s59 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_readlane_b32 s59, v0, 0 +; GFX940-NEXT: s_mov_b32 s32, s33 ; GFX940-NEXT: s_xor_saveexec_b64 s[2:3], -1 ; GFX940-NEXT: s_add_i32 s1, s33, 0x4040 ; GFX940-NEXT: scratch_load_dword v0, off, s1 ; 4-byte Folded Reload ; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_addk_i32 s32, 0xbf80 ; GFX940-NEXT: s_mov_b32 s33, s0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 299bbdac60091..4bc7711f2f839 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -229,11 +229,11 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-NEXT: v_readlane_b32 s34, v43, 2 ; GFX9-NEXT: v_readlane_b32 s31, v43, 1 ; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v43, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index 0e750d879ac94..2d853212166e9 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -12,7 +12,7 @@ define hidden fastcc void @callee_has_fp() #1 { ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffe00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) @@ -50,10 +50,10 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -192,10 +192,10 @@ define hidden i32 
@caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -226,10 +226,10 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v2, 1 ; CHECK-NEXT: v_readlane_b32 s30, v2, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s19 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll index 64a94a5ee0e70..593f40fd1b25e 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -24,12 +24,12 @@ declare void @external_void_func_i32(i32) #0 ; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s30, v40, 0 +; GCN: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -44,7 +44,6 @@ define void @test_func_call_external_void_func_i32_imm() #0 { ; GCN-DAG: s_addk_i32 s32, 0x1400{{$}} ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: ; GCN: s_swappc_b64 -; GCN: s_addk_i32 s32, 0xec00{{$}} ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_i32_imm_stack_use() #0 { %alloca = alloca [16 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll index 9999cb9173b5d..25b7b043fc6b6 100644 --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -38,12 +38,11 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { ; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1 -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 2bd60e869f843..fb14f1844427e 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -246,7 +246,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; MUBUF-NEXT: v_mov_b32_e32 
v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 +; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: s_mov_b32 s33, s7 ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; @@ -280,7 +280,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_add_i32 s32, s32, -16 +; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s3 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -316,8 +316,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MUBUF-NEXT: s_mov_b32 s7, s33 ; MUBUF-NEXT: s_add_i32 s33, s32, 0xfc0 +; MUBUF-NEXT: s_mov_b32 s8, s34 ; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfffff000 +; MUBUF-NEXT: s_mov_b32 s34, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x2000 ; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc ; MUBUF-NEXT: s_cbranch_execz .LBB3_2 @@ -341,7 +343,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_addk_i32 s32, 0xe000 +; MUBUF-NEXT: s_mov_b32 s32, s34 +; MUBUF-NEXT: s_mov_b32 s34, s8 ; MUBUF-NEXT: s_mov_b32 s33, s7 ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; @@ -350,8 +353,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s3, s33 ; FLATSCR-NEXT: s_add_i32 s33, s32, 63 +; FLATSCR-NEXT: s_mov_b32 s4, s34 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_andn2_b32 s33, s33, 63 +; FLATSCR-NEXT: s_mov_b32 s34, s32 ; FLATSCR-NEXT: s_addk_i32 s32, 0x80 ; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; FLATSCR-NEXT: s_cbranch_execz .LBB3_2 @@ -373,7 +378,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_addk_i32 s32, 0xff80 +; FLATSCR-NEXT: s_mov_b32 s32, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s4 ; FLATSCR-NEXT: s_mov_b32 s33, s3 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir index a204866170759..ba6524caf668d 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -37,6 +37,8 @@ body: | ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, 
implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc @@ -50,12 +52,13 @@ body: | ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = COPY $sgpr4 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -86,11 +89,17 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr - ; CHECK: liveins: $sgpr29, $vgpr1 + ; CHECK: liveins: $sgpr29, $vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 0, undef $vgpr2 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 
0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc @@ -100,11 +109,16 @@ body: | ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc - ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33 + ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33 ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31 + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr29 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -135,21 +149,28 @@ 
body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr_64 - ; CHECK: liveins: $sgpr28, $vgpr1 + ; CHECK: liveins: $sgpr28, $sgpr29, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr28 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr29 = S_ADD_I32 killed $sgpr29, 8192, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr29 - ; CHECK-NEXT: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr29 = S_ADD_I32 killed $sgpr29, 16384, implicit-def $scc - ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr29 + ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33 + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc + ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33 + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr29 ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr28 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def 
$sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -179,11 +200,13 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_prefer_vcc - ; CHECK: liveins: $sgpr28, $vgpr1 + ; CHECK: liveins: $sgpr28, $sgpr29, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr28 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31 ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec @@ -193,7 +216,8 @@ body: | ; CHECK-NEXT: $vcc_lo = S_MOV_B32 16384 ; CHECK-NEXT: $vgpr2, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr29 ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr28 ; CHECK-NEXT: S_ENDPGM 0 S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir index a4104737d974f..162d12f651d4a 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -33,6 +33,8 @@ body: | ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; 
MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; MUBUF-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec @@ -40,12 +42,13 @@ body: | ; MUBUF-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; MUBUF-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; MUBUF-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4 ; MUBUF-NEXT: S_ENDPGM 0, implicit $vcc ; @@ -60,6 +63,8 @@ body: | ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 8192, 
implicit-def $scc @@ -68,12 +73,13 @@ body: | ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -16384, implicit-def $scc + ; FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 ; FLATSCR-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir index 45e95d133e1bb..a4f936a4d705c 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir @@ -32,6 +32,8 @@ body: | ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 786432, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; 
CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc @@ -40,12 +42,13 @@ body: | ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -4096, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 262400, implicit-def dead $scc ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -786432, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = COPY $sgpr4 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir index 9462d01ba758d..63a4759d8e740 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -34,6 +34,8 @@ body: | ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; GFX8-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; GFX8-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 8192 @@ -42,12 +44,13 @@ body: | ; GFX8-NEXT: $vcc_lo = S_MOV_B32 16384 ; GFX8-NEXT: $vgpr3, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec ; GFX8-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX8-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GFX8-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; GFX8-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def 
dead $scc, implicit $exec ; GFX8-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc ; GFX8-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; GFX8-NEXT: $sgpr33 = COPY $sgpr4 ; GFX8-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -62,18 +65,21 @@ body: | ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; GFX9-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; GFX9-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; GFX9-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX9-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX9-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX9-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec ; GFX9-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX9-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec ; GFX9-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX9-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX9-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GFX9-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; GFX9-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; GFX9-NEXT: $sgpr33 = COPY $sgpr4 ; GFX9-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -88,17 +94,20 @@ body: | ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; GFX9-FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; GFX9-FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; GFX9-FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX9-FLATSCR-NEXT: $sgpr4 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec ; GFX9-FLATSCR-NEXT: $sgpr4 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $sgpr4, $vgpr1, implicit $exec + ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX9-FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GFX9-FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; GFX9-FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX9-FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc ; GFX9-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; GFX9-FLATSCR-NEXT: $sgpr32 = 
frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc ; GFX9-FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 1126db9cae93f..20e5af1b87f1f 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -340,6 +340,7 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload ; GFX906-NEXT: v_readlane_b32 s31, v41, 1 ; GFX906-NEXT: v_readlane_b32 s30, v41, 0 +; GFX906-NEXT: s_mov_b32 s32, s33 ; GFX906-NEXT: v_readlane_b32 s4, v41, 4 ; GFX906-NEXT: v_readlane_b32 s34, v41, 2 ; GFX906-NEXT: v_readlane_b32 s35, v41, 3 @@ -366,7 +367,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_mov_b64 exec, -1 ; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload ; GFX906-NEXT: s_mov_b64 exec, s[6:7] -; GFX906-NEXT: s_addk_i32 s32, 0xd800 ; GFX906-NEXT: s_mov_b32 s33, s4 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -752,8 +752,9 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[4:5] -; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX908-NEXT: s_mov_b32 s32, s33 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s4, v0 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload @@ -765,7 +766,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_addk_i32 s32, 0xd400 ; GFX908-NEXT: s_mov_b32 s33, s4 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index 648f4fc64f9d0..64a8f5484673f 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -20,8 +20,8 @@ ; CHECK-LABEL: {{^}}call_72xi32: -; GFX11-PAL: NumSgprs: 35 -; GFX11-PAL-GCNTRACKERS: NumSgprs: 35 +; GFX11-PAL: NumSgprs: 37 +; GFX11-PAL-GCNTRACKERS: NumSgprs: 37 ; GFX11-PAL: NumVgprs: 64 ; GFX11-PAL-GCNTRACKERS: NumVgprs: 64 ; GFX11-PAL: ScratchSize: 2780 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index 94cbe568a6a44..925984b15367d 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -198,6 +198,7 @@ body: | ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 2 ; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GCN-NEXT: $sgpr32 = frame-destroy COPY $sgpr33 ; 
GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) @@ -205,7 +206,6 @@ body: | ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 - ; GCN-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24, implicit-def dead $scc ; GCN-NEXT: $sgpr33 = frame-destroy COPY $vcc_hi ; GCN-NEXT: S_ENDPGM 0 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index ff2202f1e177b..4a01962aa4084 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -265,10 +265,10 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -557,10 +557,10 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:428 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1788,7 +1788,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 5536a09538e6e..0676bc79a46f5 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -226,11 +226,11 @@ entry: ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 ; GCN-NEXT: v_readlane_b32 s31, [[CSRV]], 1 ; GCN-NEXT: v_readlane_b32 s30, [[CSRV]], 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV]], 2 
; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 70bd63d31d5d7..fed60eecc8a8b 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -42,7 +42,7 @@ define void @needs_align16_default_stack_align(i32 %idx) #0 { ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0xd800 +; GCN: s_mov_b32 s32, s34 ; GCN: ; ScratchSize: 160 define void @needs_align16_stack_align4(i32 %idx) #2 { @@ -63,7 +63,7 @@ define void @needs_align16_stack_align4(i32 %idx) #2 { ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0xd000 +; GCN: s_mov_b32 s32, s34 ; GCN: ; ScratchSize: 192 define void @needs_align32(i32 %idx) #0 { @@ -79,7 +79,7 @@ define void @needs_align32(i32 %idx) #0 { ; GCN: s_addk_i32 s32, 0xd00{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0xf300 +; GCN: s_mov_b32 s32, s34 ; GCN: ; ScratchSize: 52 define void @force_realign4(i32 %idx) #1 { @@ -127,10 +127,12 @@ define amdgpu_kernel void @kernel_call_align4_from_5() { ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_addk_i32 s32, 0x4000 ; GCN-NOT: s33 ; GCN: buffer_store_dword v0, off, s[0:3], s33{{$}} -; GCN: s_addk_i32 s32, 0xc000 +; GCN: s_mov_b32 s32, s34 ; GCN: s_mov_b32 s33, [[FP_COPY]] define void @default_realign_align128(i32 %idx) #0 { %alloca.align = alloca i32, align 128, addrspace(5) @@ -175,12 +177,12 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 ; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 +; GCN-NEXT: s_mov_b32 s32, s34 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2 ; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_add_i32 s32, s32, 0xfffd0000 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 s[30:31] %temp = alloca i32, align 1024, addrspace(5) @@ -209,8 +211,8 @@ define i32 @needs_align1024_stack_args_used_inside_loop(ptr addrspace(5) nocaptu ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024 ; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen ; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]] -; GCN: s_mov_b32 s34, [[BP_COPY]] -; GCN-NEXT: s_add_i32 s32, s32, 0xfffd0000 +; GCN: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]] ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_setpc_b64 s[30:31] begin: diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index da99052ba69ba..8f16fcf6d0890 100644 --- 
a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -1282,16 +1282,16 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE32-OPT-NEXT: s_mov_b32 s32, s18 ; WAVE32-OPT-NEXT: ;;#ASMSTART ; WAVE32-OPT-NEXT: ; use s19 ; WAVE32-OPT-NEXT: ;;#ASMEND -; WAVE32-OPT-NEXT: s_mov_b32 s32, s18 ; WAVE32-OPT-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE32-OPT-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE32-OPT-NEXT: s_mov_b32 s32, s33 ; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-OPT-NEXT: s_addk_i32 s32, 0xee00 ; WAVE32-OPT-NEXT: s_mov_b32 s33, s20 ; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) ; WAVE32-OPT-NEXT: s_setpc_b64 s[30:31] @@ -1317,16 +1317,16 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE64-OPT-NEXT: s_mov_b32 s32, s18 ; WAVE64-OPT-NEXT: ;;#ASMSTART ; WAVE64-OPT-NEXT: ; use s19 ; WAVE64-OPT-NEXT: ;;#ASMEND -; WAVE64-OPT-NEXT: s_mov_b32 s32, s18 ; WAVE64-OPT-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE64-OPT-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE64-OPT-NEXT: s_mov_b32 s32, s33 ; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE64-OPT-NEXT: s_mov_b64 exec, s[4:5] -; WAVE64-OPT-NEXT: s_addk_i32 s32, 0xdc00 ; WAVE64-OPT-NEXT: s_mov_b32 s33, s20 ; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) ; WAVE64-OPT-NEXT: s_setpc_b64 s[30:31] @@ -1433,11 +1433,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: s_mov_b32 s32, s4 ; WAVE32-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE32-O0-NEXT: s_mov_b32 s32, s33 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-O0-NEXT: s_add_i32 s32, s32, 0xffffee00 ; WAVE32-O0-NEXT: s_mov_b32 s33, s24 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: s_setpc_b64 s[30:31] @@ -1544,11 +1544,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: s_mov_b32 s32, s4 ; WAVE64-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE64-O0-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE64-O0-NEXT: s_mov_b32 s32, s33 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] -; WAVE64-O0-NEXT: s_add_i32 s32, s32, 0xffffdc00 ; WAVE64-O0-NEXT: s_mov_b32 s33, s19 ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE64-O0-NEXT: s_setpc_b64 s[30:31] @@ -1655,11 +1655,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s4 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s30, v33, 0 +; 
WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s33 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0xffffee00 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s24 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) ; WAVE32-WWM-PREALLOC-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll index 3bf7fec81c041..ebd4bc881f2af 100644 --- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll @@ -196,11 +196,11 @@ define void @outgoing_f16_arg(ptr %ptr) #0 { ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -232,11 +232,11 @@ define void @outgoing_v2f16_arg(ptr %ptr) #0 { ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -268,9 +268,10 @@ define void @outgoing_f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 -; GFX7-NEXT: v_readlane_b32 s4, v42, 2 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: flat_store_short v[40:41], v0 ; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload @@ -278,7 +279,6 @@ define void @outgoing_f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -315,8 +315,9 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_readlane_b32 s4, v42, 2 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -326,7 +327,6 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; 
GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -383,11 +383,11 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 { ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -470,11 +470,11 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 { ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -528,11 +528,11 @@ define half @call_split_type_used_outside_block_v8f16() #0 { ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll index cd6cb4d1e9fe4..242b5e9aeaf42 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll @@ -28,11 +28,11 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -64,11 +64,11 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index 
b678e3e87202a..d9df80ce6c1c0 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -110,11 +110,11 @@ define hidden void @widget() { ; GCN-NEXT: v_readlane_b32 s31, v41, 1 ; GCN-NEXT: v_readlane_b32 s30, v41, 0 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v41, 16 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -458,11 +458,11 @@ define hidden void @blam() { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v45, 26 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xf800 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir index 33fb595157256..8a0bf26f81d22 100644 --- a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir +++ b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir @@ -47,6 +47,8 @@ body: | ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 11010048, implicit-def dead $scc ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -67,12 +69,13 @@ body: | ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: liveins: $vgpr2 ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; MUBUF-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; MUBUF-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 9961728, implicit-def dead $scc ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 
- ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -11010048, implicit-def dead $scc ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4 ; MUBUF-NEXT: S_ENDPGM 0 ; @@ -89,6 +92,8 @@ body: | ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.20, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 172032, implicit-def dead $scc ; FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -118,12 +123,13 @@ body: | ; FLATSCR-NEXT: bb.2: ; FLATSCR-NEXT: liveins: $vgpr2 ; FLATSCR-NEXT: {{ $}} + ; FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 155652, implicit-def dead $scc ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.20, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -172032, implicit-def dead $scc ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 ; FLATSCR-NEXT: S_ENDPGM 0 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll index 2b96e10fd3cc3..6b9476af7a493 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -54,11 +54,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v44, 1 ; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v44, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -111,12 +111,12 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s4, v44, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 
4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -165,11 +165,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v44, 1 ; GFX11-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v44, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -238,11 +238,11 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v45, 1 ; GFX9-NEXT: v_readlane_b32 s30, v45, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v45, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -288,12 +288,12 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; GFX10-NEXT: v_readlane_b32 s31, v45, 1 ; GFX10-NEXT: v_readlane_b32 s30, v45, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s4, v45, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -337,11 +337,11 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; GFX11-NEXT: v_readlane_b32 s31, v45, 1 ; GFX11-NEXT: v_readlane_b32 s30, v45, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v45, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 4e17be1ebb312..0307472fce732 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -2866,12 +2866,12 @@ define void @callee_no_stack_with_call() #1 { ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1032-NEXT: v_readlane_b32 s30, v40, 0 +; GFX1032-NEXT: s_mov_b32 s32, s33 ; GFX1032-NEXT: v_readlane_b32 s4, v40, 2 ; GFX1032-NEXT: s_or_saveexec_b32 s5, -1 ; GFX1032-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_mov_b32 exec_lo, s5 -; GFX1032-NEXT: s_addk_i32 s32, 0xfe00 ; GFX1032-NEXT: s_mov_b32 s33, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; 
GFX1032-NEXT: s_setpc_b64 s[30:31] @@ -2897,12 +2897,12 @@ define void @callee_no_stack_with_call() #1 { ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1064-NEXT: v_readlane_b32 s30, v40, 0 +; GFX1064-NEXT: s_mov_b32 s32, s33 ; GFX1064-NEXT: v_readlane_b32 s4, v40, 2 ; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064-NEXT: s_addk_i32 s32, 0xfc00 ; GFX1064-NEXT: s_mov_b32 s33, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll index d1ee82e74b3de..cb3a0e1ebb553 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll @@ -47,6 +47,7 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v40, 1 ; GFX90A-NEXT: v_readlane_b32 s30, v40, 0 +; GFX90A-NEXT: s_mov_b32 s32, s33 ; GFX90A-NEXT: v_readlane_b32 s4, v40, 4 ; GFX90A-NEXT: v_readlane_b32 s28, v40, 2 ; GFX90A-NEXT: v_readlane_b32 s29, v40, 3 @@ -56,7 +57,6 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_addk_i32 s32, 0xfc00 ; GFX90A-NEXT: s_mov_b32 s33, s4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll index 4837efe6606b8..766386d84a616 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll @@ -53,6 +53,7 @@ define void @test() #0 { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-NEXT: v_readlane_b32 s28, v40, 2 ; GCN-NEXT: v_readlane_b32 s29, v40, 3 @@ -61,7 +62,6 @@ define void @test() #0 { ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -113,6 +113,7 @@ define void @test() #0 { ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-O0-NEXT: s_mov_b32 s32, s33 ; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2 ; GCN-O0-NEXT: v_readlane_b32 s29, v40, 3 @@ -121,7 +122,6 @@ define void @test() #0 { ; GCN-O0-NEXT: s_mov_b64 exec, -1 ; GCN-O0-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GCN-O0-NEXT: s_mov_b32 s33, s4 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 1089093ea691c..2e59a36adb7e4 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -390,12 +390,12 @@ 
define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-O0-NEXT: s_mov_b32 s32, s33 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GFX9-O0-NEXT: s_mov_b32 s33, s48 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -428,12 +428,12 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-O3-NEXT: s_mov_b32 s32, s33 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] @@ -636,6 +636,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 +; GFX9-O0-NEXT: s_mov_b32 s32, s33 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -650,7 +651,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 ; GFX9-O0-NEXT: s_mov_b32 s33, s46 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -698,6 +698,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0 +; GFX9-O3-NEXT: s_mov_b32 s32, s33 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -707,7 +708,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] diff --git 
a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected index d1500e002d7e9..429bee4195fa9 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected @@ -101,7 +101,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -138,7 +138,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16 -; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected index deadc4adb02c5..842fd8836da7e 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected @@ -42,7 +42,7 @@ define dso_local i32 @check_boundaries() #0 { ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -115,7 +115,7 @@ define dso_local i32 @main() #0 { ; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16 -; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] From 970094d50b08e694c2302f7ee39b1c33d08f2405 Mon Sep 17 00:00:00 2001 From: Lou Date: Fri, 24 Jan 2025 15:08:14 +0100 Subject: [PATCH 005/432] [llvm-opt-report] Show scalable vectorization factors (#123367) Scalable vectorization factors are printed as "vscale x VF" where VF is the known minimum number of elements, an integer. Currently, llvm-opt-report always expects an integer (like for vectorization with fixed-sized vectors), and does not display any vectorization factor in the output (just 'V', but without a number). This patch adds support for scalable vectorization factors and prints them as "VNx", so for example "VNx4". The "Nx" is used to differentiate between fixed-sized and scalable factors, and is consistent with the way LLVM mangles scalable vectors in other places.
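To make the parse-and-print scheme concrete, here is a minimal standalone sketch in plain C++ with illustrative names only; the actual patch works in terms of llvm::StringRef and llvm::ElementCount, as the diff below shows. A factor string is either a plain integer or "vscale x " followed by the known minimum element count, and a scalable factor renders with the extra "Nx" marker.

    // Sketch only: VecFactor stands in for llvm::ElementCount.
    #include <cstdio>
    #include <string>
    #include <string_view>

    struct VecFactor {
      unsigned MinVal = 1;   // known minimum number of elements
      bool Scalable = false; // true for "vscale x N" factors
    };

    static VecFactor parseFactor(std::string_view Val) {
      VecFactor VF;
      constexpr std::string_view Prefix = "vscale x ";
      if (Val.substr(0, Prefix.size()) == Prefix) { // scalable spelling
        VF.Scalable = true;
        Val.remove_prefix(Prefix.size());
      }
      VF.MinVal = static_cast<unsigned>(std::stoul(std::string(Val)));
      return VF;
    }

    static std::string formatFactor(const VecFactor &VF) {
      // "V4" for a fixed factor, "VNx4" for a scalable one.
      return std::string("V") + (VF.Scalable ? "Nx" : "") +
             std::to_string(VF.MinVal);
    }

    int main() {
      std::printf("%s\n", formatFactor(parseFactor("4")).c_str());          // V4
      std::printf("%s\n", formatFactor(parseFactor("vscale x 4")).c_str()); // VNx4
    }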
--- .../tools/llvm-opt-report/Inputs/scalable.c | 9 ++++ .../llvm-opt-report/Inputs/scalable.yaml | 12 +++++ llvm/test/tools/llvm-opt-report/scalabe.test | 12 +++++ llvm/tools/llvm-opt-report/OptReport.cpp | 44 ++++++++++++++----- 4 files changed, 65 insertions(+), 12 deletions(-) create mode 100644 llvm/test/tools/llvm-opt-report/Inputs/scalable.c create mode 100644 llvm/test/tools/llvm-opt-report/Inputs/scalable.yaml create mode 100644 llvm/test/tools/llvm-opt-report/scalabe.test diff --git a/llvm/test/tools/llvm-opt-report/Inputs/scalable.c b/llvm/test/tools/llvm-opt-report/Inputs/scalable.c new file mode 100644 index 0000000000000..d2fa6fb879c1f --- /dev/null +++ b/llvm/test/tools/llvm-opt-report/Inputs/scalable.c @@ -0,0 +1,9 @@ +#include <stddef.h> + +void foo(size_t N, float A[restrict N], float B[N]) { + #pragma clang loop vectorize_width(4, scalable) + for (size_t i = 0; i < N; i++) { + A[i] = B[i] * 42.f; + } +} + diff --git a/llvm/test/tools/llvm-opt-report/Inputs/scalable.yaml b/llvm/test/tools/llvm-opt-report/Inputs/scalable.yaml new file mode 100644 index 0000000000000..7f248c57faa6c --- /dev/null +++ b/llvm/test/tools/llvm-opt-report/Inputs/scalable.yaml @@ -0,0 +1,12 @@ +--- !Passed +Pass: loop-vectorize +Name: Vectorized +DebugLoc: { File: './Inputs/scalable.c', Line: 5, Column: 3 } +Function: foo +Args: + - String: 'vectorized loop (vectorization width: ' + - VectorizationFactor: vscale x 4 + - String: ', interleaved count: ' + - InterleaveCount: '2' + - String: ')' +... diff --git a/llvm/test/tools/llvm-opt-report/scalabe.test b/llvm/test/tools/llvm-opt-report/scalabe.test new file mode 100644 index 0000000000000..c853c57c46b2b --- /dev/null +++ b/llvm/test/tools/llvm-opt-report/scalabe.test @@ -0,0 +1,12 @@ +RUN: llvm-opt-report -r %p %p/Inputs/scalable.yaml | FileCheck -strict-whitespace %s + +; CHECK: < {{.*[/\]}}scalable.c +; CHECK-NEXT: 1 | #include <stddef.h> +; CHECK-NEXT: 2 | +; CHECK-NEXT: 3 | void foo(size_t N, float A[restrict N], float B[N]) { +; CHECK-NEXT: 4 | #pragma clang loop vectorize_width(4, scalable) +; CHECK-NEXT: 5 VNx4,2 | for (size_t i = 0; i < N; i++) { +; CHECK-NEXT: 6 | A[i] = B[i] * 42.f; +; CHECK-NEXT: 7 | } +; CHECK-NEXT: 8 | } +; CHECK-NEXT: 9 | diff --git a/llvm/tools/llvm-opt-report/OptReport.cpp b/llvm/tools/llvm-opt-report/OptReport.cpp index cee9abcb49419..68ed92c8bacea 100644 --- a/llvm/tools/llvm-opt-report/OptReport.cpp +++ b/llvm/tools/llvm-opt-report/OptReport.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" +#include "llvm/Support/TypeSize.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include @@ -100,7 +101,7 @@ struct OptReportLocationInfo { OptReportLocationItemInfo Unrolled; OptReportLocationItemInfo Vectorized; - int VectorizationFactor = 1; + ElementCount VectorizationFactor = ElementCount::getFixed(1); int InterleaveCount = 1; int UnrollCount = 1; @@ -109,8 +110,9 @@ struct OptReportLocationInfo { Unrolled |= RHS.Unrolled; Vectorized |= RHS.Vectorized; - VectorizationFactor = - std::max(VectorizationFactor, RHS.VectorizationFactor); + if (ElementCount::isKnownLT(VectorizationFactor, RHS.VectorizationFactor)) + VectorizationFactor = RHS.VectorizationFactor; + InterleaveCount = std::max(InterleaveCount, RHS.InterleaveCount); UnrollCount = std::max(UnrollCount, RHS.UnrollCount); @@ -130,9 +132,11 @@ struct OptReportLocationInfo { return true; else if (RHS.Vectorized < Vectorized || Succinct) return false; - else if (VectorizationFactor < 
RHS.VectorizationFactor) + else if (ElementCount::isKnownLT(VectorizationFactor, + RHS.VectorizationFactor)) return true; - else if (VectorizationFactor > RHS.VectorizationFactor) + else if (ElementCount::isKnownGT(VectorizationFactor, + RHS.VectorizationFactor)) return false; else if (InterleaveCount < RHS.InterleaveCount) return true; @@ -197,17 +201,26 @@ static bool readLocationInfo(LocationInfoTy &LocationInfo) { bool Transformed = Remark.RemarkType == remarks::Type::Passed; - int VectorizationFactor = 1; + ElementCount VectorizationFactor = ElementCount::getFixed(1); int InterleaveCount = 1; int UnrollCount = 1; for (const remarks::Argument &Arg : Remark.Args) { - if (Arg.Key == "VectorizationFactor") - Arg.Val.getAsInteger(10, VectorizationFactor); - else if (Arg.Key == "InterleaveCount") + if (Arg.Key == "VectorizationFactor") { + int MinValue = 1; + bool IsScalable = false; + if (Arg.Val.starts_with("vscale x ")) { + Arg.Val.drop_front(9).getAsInteger(10, MinValue); + IsScalable = true; + } else { + Arg.Val.getAsInteger(10, MinValue); + } + VectorizationFactor = ElementCount::get(MinValue, IsScalable); + } else if (Arg.Key == "InterleaveCount") { Arg.Val.getAsInteger(10, InterleaveCount); - else if (Arg.Key == "UnrollCount") + } else if (Arg.Key == "UnrollCount") { Arg.Val.getAsInteger(10, UnrollCount); + } } const std::optional &Loc = Remark.Loc; @@ -292,7 +305,11 @@ static bool writeReport(LocationInfoTy &LocationInfo) { bool NothingUnrolled = !MaxLI.Unrolled.Transformed; bool NothingVectorized = !MaxLI.Vectorized.Transformed; - unsigned VFDigits = llvm::utostr(MaxLI.VectorizationFactor).size(); + unsigned VFDigits = + llvm::utostr(MaxLI.VectorizationFactor.getKnownMinValue()).size(); + if (MaxLI.VectorizationFactor.isScalable()) + VFDigits += 2; // For "Nx..." + unsigned ICDigits = llvm::utostr(MaxLI.InterleaveCount).size(); unsigned UCDigits = llvm::utostr(MaxLI.UnrollCount).size(); @@ -382,7 +399,10 @@ static bool writeReport(LocationInfoTy &LocationInfo) { raw_string_ostream RS(R); if (!Succinct) { - RS << LLI.VectorizationFactor << "," << LLI.InterleaveCount; + if (LLI.VectorizationFactor.isScalable()) + RS << "Nx"; + RS << LLI.VectorizationFactor.getKnownMinValue() << "," + << LLI.InterleaveCount; RS << std::string(VFDigits + ICDigits + 1 - R.size(), ' '); } From e5e55c04d6af4ae32c99d574f59e632595abf607 Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Fri, 24 Jan 2025 09:08:34 -0500 Subject: [PATCH 006/432] [GlobalMerge][NFC] Skip sorting by profitability when it is not needed (#124146) We were previously sorting by profitability even if we were choosing to merge all globals together, which is not impacted by UsedGlobalSet order. We can also remove iteration of UsedGlobalSets in reverse order in both cases. In the first case, the order does not matter. In the second case, we just sort by the order we need instead of sorting in the opposite direction and calling reverse. This change should only be an improvement on compile time. I have not measured it, but I think it would never make things worse. --- llvm/lib/CodeGen/GlobalMerge.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp index 7b76155b175d1..41e01a1d3ccd5 100644 --- a/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/llvm/lib/CodeGen/GlobalMerge.cpp @@ -423,24 +423,12 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl &Globals, } } - // Now we found a bunch of sets of globals used together.
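A tiny self-contained sketch of the sort direction the commit message describes: sort descending by the crude size-times-usage metric directly, instead of sorting ascending and iterating in reverse. Names here are illustrative only, and note that a stable sort requires a strict weak ordering, so a comparator of this shape should use '>' rather than '>='.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct UsedSet {
      unsigned NumGlobals; // stand-in for UGS.Globals.count()
      unsigned UsageCount; // stand-in for UGS.UsageCount
    };

    int main() {
      std::vector<UsedSet> Sets = {{2, 3}, {4, 1}, {3, 5}};
      // Most profitable first; strict '>' keeps the ordering valid.
      std::stable_sort(Sets.begin(), Sets.end(),
                       [](const UsedSet &A, const UsedSet &B) {
                         return A.NumGlobals * A.UsageCount >
                                B.NumGlobals * B.UsageCount;
                       });
      for (const UsedSet &S : Sets) // prints 15, 6, 4
        std::printf("%u\n", S.NumGlobals * S.UsageCount);
    }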
We accumulated - // the number of times we encountered the sets (i.e., the number of functions - // that use that exact set of globals). - // - // Multiply that by the size of the set to give us a crude profitability - // metric. - llvm::stable_sort(UsedGlobalSets, - [](const UsedGlobalSet &UGS1, const UsedGlobalSet &UGS2) { - return UGS1.Globals.count() * UGS1.UsageCount < - UGS2.Globals.count() * UGS2.UsageCount; - }); - // We can choose to merge all globals together, but ignore globals never used // with another global. This catches the obviously non-profitable cases of // having a single global, but is aggressive enough for any other case. if (GlobalMergeIgnoreSingleUse) { BitVector AllGlobals(Globals.size()); - for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) { + for (const UsedGlobalSet &UGS : UsedGlobalSets) { if (UGS.UsageCount == 0) continue; if (UGS.Globals.count() > 1) @@ -449,6 +437,16 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl &Globals, return doMerge(Globals, AllGlobals, M, isConst, AddrSpace); } + // Now we found a bunch of sets of globals used together. We accumulated + // the number of times we encountered the sets (i.e., the number of functions + // that use that exact set of globals). Multiply that by the size of the set + // to give us a crude profitability metric. + llvm::stable_sort(UsedGlobalSets, + [](const UsedGlobalSet &UGS1, const UsedGlobalSet &UGS2) { + return UGS1.Globals.count() * UGS1.UsageCount >= + UGS2.Globals.count() * UGS2.UsageCount; + }); + // Starting from the sets with the best (=biggest) profitability, find a // good combination. // The ideal (and expensive) solution can only be found by trying all @@ -458,7 +456,7 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl &Globals, BitVector PickedGlobals(Globals.size()); bool Changed = false; - for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) { + for (const UsedGlobalSet &UGS : UsedGlobalSets) { if (UGS.UsageCount == 0) continue; if (PickedGlobals.anyCommon(UGS.Globals)) From 77c780d64b950d6850d5ec1ee06cd0c21b38b89e Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 24 Jan 2025 15:11:13 +0100 Subject: [PATCH 007/432] [bazel] Port eb206e9ea84eff0a0596fed2de8316d924f946d1 Leave around an alias so users can move at their own pace. --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 72c28faed1d16..3336daf6773b2 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -7,6 +7,7 @@ load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") load("@bazel_skylib//rules:expand_template.bzl", "expand_template") +load("//llvm:binary_alias.bzl", "binary_alias") load( ":build_defs.bzl", "cc_headers_only", @@ -10174,8 +10175,8 @@ cc_binary( ) cc_binary( - name = "mlir-cpu-runner", - srcs = ["tools/mlir-cpu-runner/mlir-cpu-runner.cpp"], + name = "mlir-runner", + srcs = ["tools/mlir-runner/mlir-runner.cpp"], deps = [ ":AllToLLVMIRTranslations", ":BuiltinToLLVMIRTranslation", @@ -10195,6 +10196,12 @@ cc_binary( ], ) +# TODO: Remove this alias. +binary_alias( + name = "mlir-cpu-runner", + binary = ":mlir-runner", +) + # This target provides the headers from LLVM's Support target without any of # the symbols. 
In particular, it does not contain the static registration code # which may be executed by at most one shared library loaded by ORCJit. Direct From acde3f722ff3766f6f793884108d342b78623fe4 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Fri, 24 Jan 2025 09:26:28 -0500 Subject: [PATCH 008/432] [mlir:python] Compute get_op_result_or_value in PyOpView's constructor. (#123953) This logic is in the critical path for constructing an operation from Python. It is faster to compute this in C++ than it is in Python, and it is a minor change to do this. This change also alters the API contract of _ods_common.get_op_results_or_values to avoid calling get_op_result_or_value on each element of a sequence, since the C++ code will now do this. Most of the diff here is simply reordering the code in IRCore.cpp. --- mlir/lib/Bindings/Python/IRCore.cpp | 432 ++++++++++-------- mlir/lib/Bindings/Python/IRModule.h | 2 +- mlir/python/mlir/dialects/_ods_common.py | 7 +- mlir/test/mlir-tblgen/op-python-bindings.td | 26 +- mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp | 9 +- 5 files changed, 255 insertions(+), 221 deletions(-) diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 738f1444b15fe..8e351cb22eb94 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -1481,12 +1481,11 @@ static void maybeInsertOperation(PyOperationRef &op, nb::object PyOperation::create(std::string_view name, std::optional> results, - std::optional> operands, + llvm::ArrayRef operands, std::optional attributes, std::optional> successors, int regions, DefaultingPyLocation location, const nb::object &maybeIp, bool inferType) { - llvm::SmallVector mlirOperands; llvm::SmallVector mlirResults; llvm::SmallVector mlirSuccessors; llvm::SmallVector, 4> mlirAttributes; @@ -1495,16 +1494,6 @@ nb::object PyOperation::create(std::string_view name, if (regions < 0) throw nb::value_error("number of regions must be >= 0"); - // Unpack/validate operands. - if (operands) { - mlirOperands.reserve(operands->size()); - for (PyValue *operand : *operands) { - if (!operand) - throw nb::value_error("operand value cannot be None"); - mlirOperands.push_back(operand->get()); - } - } - // Unpack/validate results. if (results) { mlirResults.reserve(results->size()); @@ -1562,9 +1551,8 @@ nb::object PyOperation::create(std::string_view name, // point, exceptions cannot be thrown or else the state will leak. MlirOperationState state = mlirOperationStateGet(toMlirStringRef(name), location); - if (!mlirOperands.empty()) - mlirOperationStateAddOperands(&state, mlirOperands.size(), - mlirOperands.data()); + if (!operands.empty()) + mlirOperationStateAddOperands(&state, operands.size(), operands.data()); state.enableResultTypeInference = inferType; if (!mlirResults.empty()) mlirOperationStateAddResults(&state, mlirResults.size(), @@ -1632,6 +1620,143 @@ void PyOperation::erase() { mlirOperationDestroy(operation); } +namespace { +/// CRTP base class for Python MLIR values that subclass Value and should be +/// castable from it. The value hierarchy is one level deep and is not supposed +/// to accommodate other levels unless core MLIR changes. +template +class PyConcreteValue : public PyValue { +public: + // Derived classes must define statics for: + // IsAFunctionTy isaFunction + // const char *pyClassName + // and redefine bindDerived. 
+ using ClassTy = nb::class_; + using IsAFunctionTy = bool (*)(MlirValue); + + PyConcreteValue() = default; + PyConcreteValue(PyOperationRef operationRef, MlirValue value) + : PyValue(operationRef, value) {} + PyConcreteValue(PyValue &orig) + : PyConcreteValue(orig.getParentOperation(), castFrom(orig)) {} + + /// Attempts to cast the original value to the derived type and throws on + /// type mismatches. + static MlirValue castFrom(PyValue &orig) { + if (!DerivedTy::isaFunction(orig.get())) { + auto origRepr = nb::cast(nb::repr(nb::cast(orig))); + throw nb::value_error((Twine("Cannot cast value to ") + + DerivedTy::pyClassName + " (from " + origRepr + + ")") + .str() + .c_str()); + } + return orig.get(); + } + + /// Binds the Python module objects to functions of this class. + static void bind(nb::module_ &m) { + auto cls = ClassTy(m, DerivedTy::pyClassName); + cls.def(nb::init(), nb::keep_alive<0, 1>(), nb::arg("value")); + cls.def_static( + "isinstance", + [](PyValue &otherValue) -> bool { + return DerivedTy::isaFunction(otherValue); + }, + nb::arg("other_value")); + cls.def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, + [](DerivedTy &self) { return self.maybeDownCast(); }); + DerivedTy::bindDerived(cls); + } + + /// Implemented by derived classes to add methods to the Python subclass. + static void bindDerived(ClassTy &m) {} +}; + +} // namespace + +/// Python wrapper for MlirOpResult. +class PyOpResult : public PyConcreteValue { +public: + static constexpr IsAFunctionTy isaFunction = mlirValueIsAOpResult; + static constexpr const char *pyClassName = "OpResult"; + using PyConcreteValue::PyConcreteValue; + + static void bindDerived(ClassTy &c) { + c.def_prop_ro("owner", [](PyOpResult &self) { + assert( + mlirOperationEqual(self.getParentOperation()->get(), + mlirOpResultGetOwner(self.get())) && + "expected the owner of the value in Python to match that in the IR"); + return self.getParentOperation().getObject(); + }); + c.def_prop_ro("result_number", [](PyOpResult &self) { + return mlirOpResultGetResultNumber(self.get()); + }); + } +}; + +/// Returns the list of types of the values held by container. +template +static std::vector getValueTypes(Container &container, + PyMlirContextRef &context) { + std::vector result; + result.reserve(container.size()); + for (int i = 0, e = container.size(); i < e; ++i) { + result.push_back(mlirValueGetType(container.getElement(i).get())); + } + return result; +} + +/// A list of operation results. Internally, these are stored as consecutive +/// elements, random access is cheap. The (returned) result list is associated +/// with the operation whose results these are, and thus extends the lifetime of +/// this operation. +class PyOpResultList : public Sliceable { +public: + static constexpr const char *pyClassName = "OpResultList"; + using SliceableT = Sliceable; + + PyOpResultList(PyOperationRef operation, intptr_t startIndex = 0, + intptr_t length = -1, intptr_t step = 1) + : Sliceable(startIndex, + length == -1 ? mlirOperationGetNumResults(operation->get()) + : length, + step), + operation(std::move(operation)) {} + + static void bindDerived(ClassTy &c) { + c.def_prop_ro("types", [](PyOpResultList &self) { + return getValueTypes(self, self.operation->getContext()); + }); + c.def_prop_ro("owner", [](PyOpResultList &self) { + return self.operation->createOpView(); + }); + } + + PyOperationRef &getOperation() { return operation; } + +private: + /// Give the parent CRTP class access to hook implementations below. 
+ friend class Sliceable; + + intptr_t getRawNumElements() { + operation->checkValid(); + return mlirOperationGetNumResults(operation->get()); + } + + PyOpResult getRawElement(intptr_t index) { + PyValue value(operation, mlirOperationGetResult(operation->get(), index)); + return PyOpResult(value); + } + + PyOpResultList slice(intptr_t startIndex, intptr_t length, intptr_t step) { + return PyOpResultList(operation, startIndex, length, step); + } + + PyOperationRef operation; +}; + //------------------------------------------------------------------------------ // PyOpView //------------------------------------------------------------------------------ @@ -1733,6 +1858,40 @@ static void populateResultTypes(StringRef name, nb::list resultTypeList, } } +static MlirValue getUniqueResult(MlirOperation operation) { + auto numResults = mlirOperationGetNumResults(operation); + if (numResults != 1) { + auto name = mlirIdentifierStr(mlirOperationGetName(operation)); + throw nb::value_error((Twine("Cannot call .result on operation ") + + StringRef(name.data, name.length) + " which has " + + Twine(numResults) + + " results (it is only valid for operations with a " + "single result)") + .str() + .c_str()); + } + return mlirOperationGetResult(operation, 0); +} + +static MlirValue getOpResultOrValue(nb::handle operand) { + if (operand.is_none()) { + throw nb::value_error("contained a None item"); + } + PyOperationBase *op; + if (nb::try_cast(operand, op)) { + return getUniqueResult(op->getOperation()); + } + PyOpResultList *opResultList; + if (nb::try_cast(operand, opResultList)) { + return getUniqueResult(opResultList->getOperation()->get()); + } + PyValue *value; + if (nb::try_cast(operand, value)) { + return value->get(); + } + throw nb::value_error("is not a Value"); +} + nb::object PyOpView::buildGeneric( std::string_view name, std::tuple opRegionSpec, nb::object operandSegmentSpecObj, nb::object resultSegmentSpecObj, @@ -1783,16 +1942,14 @@ nb::object PyOpView::buildGeneric( } // Unpack operands. - std::vector operands; + llvm::SmallVector operands; operands.reserve(operands.size()); if (operandSegmentSpecObj.is_none()) { // Non-sized operand unpacking. for (const auto &it : llvm::enumerate(operandList)) { try { - operands.push_back(nb::cast(it.value())); - if (!operands.back()) - throw nb::cast_error(); - } catch (nb::cast_error &err) { + operands.push_back(getOpResultOrValue(it.value())); + } catch (nb::builtin_exception &err) { throw nb::value_error((llvm::Twine("Operand ") + llvm::Twine(it.index()) + " of operation \"" + name + "\" must be a Value (" + err.what() + ")") @@ -1818,29 +1975,31 @@ nb::object PyOpView::buildGeneric( int segmentSpec = std::get<1>(it.value()); if (segmentSpec == 1 || segmentSpec == 0) { // Unpack unary element. - try { - auto *operandValue = nb::cast(std::get<0>(it.value())); - if (operandValue) { - operands.push_back(operandValue); - operandSegmentLengths.push_back(1); - } else if (segmentSpec == 0) { - // Allowed to be optional. 
- operandSegmentLengths.push_back(0); - } else { - throw nb::value_error( - (llvm::Twine("Operand ") + llvm::Twine(it.index()) + - " of operation \"" + name + - "\" must be a Value (was None and operand is not optional)") - .str() - .c_str()); + auto &operand = std::get<0>(it.value()); + if (!operand.is_none()) { + try { + + operands.push_back(getOpResultOrValue(operand)); + } catch (nb::builtin_exception &err) { + throw nb::value_error((llvm::Twine("Operand ") + + llvm::Twine(it.index()) + + " of operation \"" + name + + "\" must be a Value (" + err.what() + ")") + .str() + .c_str()); } - } catch (nb::cast_error &err) { - throw nb::value_error((llvm::Twine("Operand ") + - llvm::Twine(it.index()) + " of operation \"" + - name + "\" must be a Value (" + err.what() + - ")") - .str() - .c_str()); + + operandSegmentLengths.push_back(1); + } else if (segmentSpec == 0) { + // Allowed to be optional. + operandSegmentLengths.push_back(0); + } else { + throw nb::value_error( + (llvm::Twine("Operand ") + llvm::Twine(it.index()) + + " of operation \"" + name + + "\" must be a Value (was None and operand is not optional)") + .str() + .c_str()); } } else if (segmentSpec == -1) { // Unpack sequence by appending. @@ -1852,10 +2011,7 @@ nb::object PyOpView::buildGeneric( // Unpack the list. auto segment = nb::cast(std::get<0>(it.value())); for (nb::handle segmentItem : segment) { - operands.push_back(nb::cast(segmentItem)); - if (!operands.back()) { - throw nb::type_error("contained a None item"); - } + operands.push_back(getOpResultOrValue(segmentItem)); } operandSegmentLengths.push_back(nb::len(segment)); } @@ -2269,57 +2425,6 @@ void PySymbolTable::walkSymbolTables(PyOperationBase &from, } namespace { -/// CRTP base class for Python MLIR values that subclass Value and should be -/// castable from it. The value hierarchy is one level deep and is not supposed -/// to accommodate other levels unless core MLIR changes. -template -class PyConcreteValue : public PyValue { -public: - // Derived classes must define statics for: - // IsAFunctionTy isaFunction - // const char *pyClassName - // and redefine bindDerived. - using ClassTy = nb::class_; - using IsAFunctionTy = bool (*)(MlirValue); - - PyConcreteValue() = default; - PyConcreteValue(PyOperationRef operationRef, MlirValue value) - : PyValue(operationRef, value) {} - PyConcreteValue(PyValue &orig) - : PyConcreteValue(orig.getParentOperation(), castFrom(orig)) {} - - /// Attempts to cast the original value to the derived type and throws on - /// type mismatches. - static MlirValue castFrom(PyValue &orig) { - if (!DerivedTy::isaFunction(orig.get())) { - auto origRepr = nb::cast(nb::repr(nb::cast(orig))); - throw nb::value_error((Twine("Cannot cast value to ") + - DerivedTy::pyClassName + " (from " + origRepr + - ")") - .str() - .c_str()); - } - return orig.get(); - } - - /// Binds the Python module objects to functions of this class. - static void bind(nb::module_ &m) { - auto cls = ClassTy(m, DerivedTy::pyClassName); - cls.def(nb::init(), nb::keep_alive<0, 1>(), nb::arg("value")); - cls.def_static( - "isinstance", - [](PyValue &otherValue) -> bool { - return DerivedTy::isaFunction(otherValue); - }, - nb::arg("other_value")); - cls.def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, - [](DerivedTy &self) { return self.maybeDownCast(); }); - DerivedTy::bindDerived(cls); - } - - /// Implemented by derived classes to add methods to the Python subclass. - static void bindDerived(ClassTy &m) {} -}; /// Python wrapper for MlirBlockArgument. 
class PyBlockArgument : public PyConcreteValue { @@ -2345,39 +2450,6 @@ class PyBlockArgument : public PyConcreteValue { } }; -/// Python wrapper for MlirOpResult. -class PyOpResult : public PyConcreteValue { -public: - static constexpr IsAFunctionTy isaFunction = mlirValueIsAOpResult; - static constexpr const char *pyClassName = "OpResult"; - using PyConcreteValue::PyConcreteValue; - - static void bindDerived(ClassTy &c) { - c.def_prop_ro("owner", [](PyOpResult &self) { - assert( - mlirOperationEqual(self.getParentOperation()->get(), - mlirOpResultGetOwner(self.get())) && - "expected the owner of the value in Python to match that in the IR"); - return self.getParentOperation().getObject(); - }); - c.def_prop_ro("result_number", [](PyOpResult &self) { - return mlirOpResultGetResultNumber(self.get()); - }); - } -}; - -/// Returns the list of types of the values held by container. -template -static std::vector getValueTypes(Container &container, - PyMlirContextRef &context) { - std::vector result; - result.reserve(container.size()); - for (int i = 0, e = container.size(); i < e; ++i) { - result.push_back(mlirValueGetType(container.getElement(i).get())); - } - return result; -} - /// A list of block arguments. Internally, these are stored as consecutive /// elements, random access is cheap. The argument list is associated with the /// operation that contains the block (detached blocks are not allowed in @@ -2484,53 +2556,6 @@ class PyOpOperandList : public Sliceable { PyOperationRef operation; }; -/// A list of operation results. Internally, these are stored as consecutive -/// elements, random access is cheap. The (returned) result list is associated -/// with the operation whose results these are, and thus extends the lifetime of -/// this operation. -class PyOpResultList : public Sliceable { -public: - static constexpr const char *pyClassName = "OpResultList"; - using SliceableT = Sliceable; - - PyOpResultList(PyOperationRef operation, intptr_t startIndex = 0, - intptr_t length = -1, intptr_t step = 1) - : Sliceable(startIndex, - length == -1 ? mlirOperationGetNumResults(operation->get()) - : length, - step), - operation(std::move(operation)) {} - - static void bindDerived(ClassTy &c) { - c.def_prop_ro("types", [](PyOpResultList &self) { - return getValueTypes(self, self.operation->getContext()); - }); - c.def_prop_ro("owner", [](PyOpResultList &self) { - return self.operation->createOpView(); - }); - } - -private: - /// Give the parent CRTP class access to hook implementations below. - friend class Sliceable; - - intptr_t getRawNumElements() { - operation->checkValid(); - return mlirOperationGetNumResults(operation->get()); - } - - PyOpResult getRawElement(intptr_t index) { - PyValue value(operation, mlirOperationGetResult(operation->get(), index)); - return PyOpResult(value); - } - - PyOpResultList slice(intptr_t startIndex, intptr_t length, intptr_t step) { - return PyOpResultList(operation, startIndex, length, step); - } - - PyOperationRef operation; -}; - /// A list of operation successors. Internally, these are stored as consecutive /// elements, random access is cheap. 
The (returned) successor list is /// associated with the operation whose successors these are, and thus extends @@ -3123,20 +3148,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { "result", [](PyOperationBase &self) { auto &operation = self.getOperation(); - auto numResults = mlirOperationGetNumResults(operation); - if (numResults != 1) { - auto name = mlirIdentifierStr(mlirOperationGetName(operation)); - throw nb::value_error( - (Twine("Cannot call .result on operation ") + - StringRef(name.data, name.length) + " which has " + - Twine(numResults) + - " results (it is only valid for operations with a " - "single result)") - .str() - .c_str()); - } - return PyOpResult(operation.getRef(), - mlirOperationGetResult(operation, 0)) + return PyOpResult(operation.getRef(), getUniqueResult(operation)) .maybeDownCast(); }, "Shortcut to get an op result if it has only one (throws an error " @@ -3233,14 +3245,36 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::arg("walk_order") = MlirWalkPostOrder); nb::class_(m, "Operation") - .def_static("create", &PyOperation::create, nb::arg("name"), - nb::arg("results").none() = nb::none(), - nb::arg("operands").none() = nb::none(), - nb::arg("attributes").none() = nb::none(), - nb::arg("successors").none() = nb::none(), - nb::arg("regions") = 0, nb::arg("loc").none() = nb::none(), - nb::arg("ip").none() = nb::none(), - nb::arg("infer_type") = false, kOperationCreateDocstring) + .def_static( + "create", + [](std::string_view name, + std::optional> results, + std::optional> operands, + std::optional attributes, + std::optional> successors, int regions, + DefaultingPyLocation location, const nb::object &maybeIp, + bool inferType) { + // Unpack/validate operands. + llvm::SmallVector mlirOperands; + if (operands) { + mlirOperands.reserve(operands->size()); + for (PyValue *operand : *operands) { + if (!operand) + throw nb::value_error("operand value cannot be None"); + mlirOperands.push_back(operand->get()); + } + } + + return PyOperation::create(name, results, mlirOperands, attributes, + successors, regions, location, maybeIp, + inferType); + }, + nb::arg("name"), nb::arg("results").none() = nb::none(), + nb::arg("operands").none() = nb::none(), + nb::arg("attributes").none() = nb::none(), + nb::arg("successors").none() = nb::none(), nb::arg("regions") = 0, + nb::arg("loc").none() = nb::none(), nb::arg("ip").none() = nb::none(), + nb::arg("infer_type") = false, kOperationCreateDocstring) .def_static( "parse", [](const std::string &sourceStr, const std::string &sourceName, diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index fd70ac7ac6ec3..dd6e7ef912374 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -686,7 +686,7 @@ class PyOperation : public PyOperationBase, public BaseContextObject { /// Creates an operation. See corresponding python docstring. 
static nanobind::object create(std::string_view name, std::optional> results, - std::optional> operands, + llvm::ArrayRef operands, std::optional attributes, std::optional> successors, int regions, DefaultingPyLocation location, const nanobind::object &ip, diff --git a/mlir/python/mlir/dialects/_ods_common.py b/mlir/python/mlir/dialects/_ods_common.py index 5b67ab03d6f49..d3dbdc604ef4c 100644 --- a/mlir/python/mlir/dialects/_ods_common.py +++ b/mlir/python/mlir/dialects/_ods_common.py @@ -115,7 +115,10 @@ def get_op_results_or_values( _cext.ir.Operation, _Sequence[_Union[_cext.ir.OpView, _cext.ir.Operation, _cext.ir.Value]], ] -) -> _Union[_Sequence[_cext.ir.Value], _cext.ir.OpResultList]: +) -> _Union[ + _Sequence[_Union[_cext.ir.OpView, _cext.ir.Operation, _cext.ir.Value]], + _cext.ir.OpResultList, +]: """Returns the given sequence of values or the results of the given op. This is useful to implement op constructors so that they can take other ops as @@ -127,7 +130,7 @@ def get_op_results_or_values( elif isinstance(arg, _cext.ir.Operation): return arg.results else: - return [get_op_result_or_value(element) for element in arg] + return arg def get_op_result_or_op_results( diff --git a/mlir/test/mlir-tblgen/op-python-bindings.td b/mlir/test/mlir-tblgen/op-python-bindings.td index 25833779c2f71..72963cac64d54 100644 --- a/mlir/test/mlir-tblgen/op-python-bindings.td +++ b/mlir/test/mlir-tblgen/op-python-bindings.td @@ -27,8 +27,8 @@ def AttrSizedOperandsOp : TestOp<"attr_sized_operands", // CHECK: attributes = {} // CHECK: regions = None // CHECK: operands.append(_get_op_results_or_values(variadic1)) - // CHECK: operands.append(_get_op_result_or_value(non_variadic)) - // CHECK: operands.append(_get_op_result_or_value(variadic2) if variadic2 is not None else None) + // CHECK: operands.append(non_variadic) + // CHECK: operands.append(variadic2) // CHECK: _ods_successors = None // CHECK: super().__init__( // CHECK: self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, @@ -173,8 +173,8 @@ def AttributedOpWithOperands : TestOp<"attributed_op_with_operands"> { // CHECK: results = [] // CHECK: attributes = {} // CHECK: regions = None - // CHECK: operands.append(_get_op_result_or_value(_gen_arg_0)) - // CHECK: operands.append(_get_op_result_or_value(_gen_arg_2)) + // CHECK: operands.append(_gen_arg_0) + // CHECK: operands.append(_gen_arg_2) // CHECK: if bool(in_): attributes["in"] = _ods_ir.UnitAttr.get( // CHECK: _ods_get_default_loc_context(loc)) // CHECK: if is_ is not None: attributes["is"] = (is_ @@ -307,9 +307,9 @@ def MissingNamesOp : TestOp<"missing_names"> { // CHECK: results = [] // CHECK: attributes = {} // CHECK: regions = None - // CHECK: operands.append(_get_op_result_or_value(_gen_arg_0)) - // CHECK: operands.append(_get_op_result_or_value(f32)) - // CHECK: operands.append(_get_op_result_or_value(_gen_arg_2)) + // CHECK: operands.append(_gen_arg_0) + // CHECK: operands.append(f32) + // CHECK: operands.append(_gen_arg_2) // CHECK: results.append(i32) // CHECK: results.append(_gen_res_1) // CHECK: results.append(i64) @@ -349,8 +349,8 @@ def OneOptionalOperandOp : TestOp<"one_optional_operand"> { // CHECK: results = [] // CHECK: attributes = {} // CHECK: regions = None - // CHECK: operands.append(_get_op_result_or_value(non_optional)) - // CHECK: if optional is not None: operands.append(_get_op_result_or_value(optional)) + // CHECK: operands.append(non_optional) + // CHECK: if optional is not None: operands.append(optional) // CHECK: 
_ods_successors = None // CHECK: super().__init__( // CHECK: self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS @@ -380,7 +380,7 @@ def OneVariadicOperandOp : TestOp<"one_variadic_operand"> { // CHECK: results = [] // CHECK: attributes = {} // CHECK: regions = None - // CHECK: operands.append(_get_op_result_or_value(non_variadic)) + // CHECK: operands.append(non_variadic) // CHECK: operands.extend(_get_op_results_or_values(variadic)) // CHECK: _ods_successors = None // CHECK: super().__init__( @@ -445,7 +445,7 @@ def PythonKeywordOp : TestOp<"python_keyword"> { // CHECK: results = [] // CHECK: attributes = {} // CHECK: regions = None - // CHECK: operands.append(_get_op_result_or_value(in_)) + // CHECK: operands.append(in_) // CHECK: _ods_successors = None // CHECK: super().__init__( // CHECK: self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS @@ -547,8 +547,8 @@ def SimpleOp : TestOp<"simple"> { // CHECK: results = [] // CHECK: attributes = {} // CHECK: regions = None - // CHECK: operands.append(_get_op_result_or_value(i32)) - // CHECK: operands.append(_get_op_result_or_value(f32)) + // CHECK: operands.append(i32) + // CHECK: operands.append(f32) // CHECK: results.append(i64) // CHECK: results.append(f64) // CHECK: _ods_successors = None diff --git a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp index e1540d1750ff1..604d2376052a8 100644 --- a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp +++ b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp @@ -37,7 +37,6 @@ from ._ods_common import ( equally_sized_accessor as _ods_equally_sized_accessor, get_default_loc_context as _ods_get_default_loc_context, get_op_result_or_op_results as _get_op_result_or_op_results, - get_op_result_or_value as _get_op_result_or_value, get_op_results_or_values as _get_op_results_or_values, segmented_accessor as _ods_segmented_accessor, ) @@ -501,17 +500,15 @@ constexpr const char *initTemplate = R"Py( /// Template for appending a single element to the operand/result list. /// {0} is the field name. -constexpr const char *singleOperandAppendTemplate = - "operands.append(_get_op_result_or_value({0}))"; +constexpr const char *singleOperandAppendTemplate = "operands.append({0})"; constexpr const char *singleResultAppendTemplate = "results.append({0})"; /// Template for appending an optional element to the operand/result list. /// {0} is the field name. 
constexpr const char *optionalAppendOperandTemplate = - "if {0} is not None: operands.append(_get_op_result_or_value({0}))"; + "if {0} is not None: operands.append({0})"; constexpr const char *optionalAppendAttrSizedOperandsTemplate = - "operands.append(_get_op_result_or_value({0}) if {0} is not None else " - "None)"; + "operands.append({0})"; constexpr const char *optionalAppendResultTemplate = "if {0} is not None: results.append({0})"; From f2b253b9613a858ae3dd5bf5ccbba87b64941688 Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Fri, 24 Jan 2025 09:28:27 -0500 Subject: [PATCH 009/432] [SelectionDAG] Fix an incorrect DebugLoc on a COPY (#122963) Fixes: SWDEV-502134 --- .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 5 +- llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll | 70 +++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 8e313fb21eede..333ec5e98b2bc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -351,8 +351,9 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB, OpRC = TRI->getAllocatableClass(OpRC); assert(OpRC && "Constraints cannot be fulfilled for allocation"); Register NewVReg = MRI->createVirtualRegister(OpRC); - BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(), - TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg); + BuildMI(*MBB, InsertPos, MIB->getDebugLoc(), + TII->get(TargetOpcode::COPY), NewVReg) + .addReg(VReg); VReg = NewVReg; } else { assert(ConstrainedRC->isAllocatable() && diff --git a/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll b/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll new file mode 100644 index 0000000000000..8b54f709eec7a --- /dev/null +++ b/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll @@ -0,0 +1,70 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s + +; Verify that the debug locations in this function are correct, in particular +; that the location for %cast doesn't appear in the block of %lab. 
+ +define void @_Z12lane_pc_testj() #0 !dbg !9 { +; GCN-LABEL: _Z12lane_pc_testj: +; GCN: .Lfunc_begin0: +; GCN-NEXT: .file 0 "/" "t.cpp" +; GCN-NEXT: .loc 0 3 0 ; t.cpp:3:0 +; GCN-NEXT: .cfi_sections .debug_frame +; GCN-NEXT: .cfi_startproc +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: ; %bb.1: ; %lab +; GCN-NEXT: .Ltmp0: +; GCN-NEXT: .loc 0 12 1 prologue_end ; t.cpp:12:1 +; GCN-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN-NEXT: s_mov_b32 s6, 32 +; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_mov_b32 s5, -1 +; GCN-NEXT: s_lshr_b32 s8, s32, 5 +; GCN-NEXT: s_cmp_lg_u32 s8, s5 +; GCN-NEXT: s_cselect_b32 s5, s4, s7 +; GCN-NEXT: s_cselect_b32 s4, s8, s6 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: .loc 0 13 1 ; t.cpp:13:1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: .loc 0 14 1 ; t.cpp:14:1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .Ltmp1: + %alloc = alloca i32, align 4, addrspace(5) + %cast = addrspacecast ptr addrspace(5) %alloc to ptr, !dbg !12 + br label %lab + +lab: + store i32 0, ptr %cast, align 4, !dbg !13 + store i32 1, ptr %cast, align 4, !dbg !14 + ret void +} + +attributes #0 = { noinline optnone } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "t.cpp", directory: "/") +!2 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!3 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +!4 = !{i32 7, !"Dwarf Version", i32 5} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 8, !"PIC Level", i32 2} +!8 = !{i32 7, !"frame-pointer", i32 2} +!9 = distinct !DISubprogram(name: "lane_pc_test", linkageName: "_Z12lane_pc_testj", scope: !1, file: !1, line: 1, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, type: !10, unit: !0) +!10 = !DISubroutineType(types: !11) +!11 = !{} +!12 = !DILocation(line: 12, column: 1, scope: !9) +!13 = !DILocation(line: 13, column: 1, scope: !9) +!14 = !DILocation(line: 14, column: 1, scope: !9) From 2068b1ba031e258a6448bea372005d19692c802a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 24 Jan 2025 15:31:53 +0100 Subject: [PATCH 010/432] [X86] Fix ABI for passing after i128 (#124134) If we're passing an i128 value and we no longer have enough argument registers (only r9 unallocated), the value gets passed via the stack. However, r9 is still allocated as a shadow register, which means that a following i64 argument will not use it. This doesn't match the x86-64 psABI. Fix this by marking i128 arguments as requiring consecutive registers, and then adding a custom CC lowering that will allocate both parts of the i128 at the same time, either to register or to stack, without reserving a shadow register. Fixes https://github.com/llvm/llvm-project/issues/123935.
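As a rough model of the allocation rule just described, with hypothetical names throughout (not the real CCState or CC_X86_64_I128 interfaces shown in the diff below): the two i64 halves of an i128 either occupy a consecutive register pair or the whole value takes a 16-byte stack slot, and in the stack case the register cursor is deliberately left alone so no shadow register is consumed.

    #include <cstdio>

    static const char *GPRs[] = {"rdi", "rsi", "rdx", "rcx", "r8", "r9"};

    struct ArgState {
      int NextGPR = 0;     // index of the next free integer register
      int StackOffset = 0; // next free, 16-byte-aligned stack slot
    };

    // Allocates one i128. Returns true if it landed in a register pair.
    static bool allocI128(ArgState &S, const char *&Lo, const char *&Hi,
                          int &Off) {
      if (S.NextGPR + 2 <= 6) { // a full consecutive pair is still free
        Lo = GPRs[S.NextGPR++];
        Hi = GPRs[S.NextGPR++];
        return true;
      }
      Off = S.StackOffset; // whole value goes to the stack...
      S.StackOffset += 16;
      return false;        // ...and NextGPR is deliberately not bumped
    }

    int main() {
      ArgState S;
      S.NextGPR = 5; // only r9 unallocated, as in the issue
      const char *Lo = nullptr, *Hi = nullptr;
      int Off = 0;
      if (!allocI128(S, Lo, Hi, Off))
        std::printf("i128 -> stack+%d\n", Off);         // stack+0
      std::printf("next i64 -> %s\n", GPRs[S.NextGPR]); // r9, no shadow reserved
    }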
---
 llvm/lib/Target/X86/X86CallingConv.cpp      | 34 ++++++++++++
 llvm/lib/Target/X86/X86CallingConv.td       |  6 +--
 llvm/lib/Target/X86/X86ISelLowering.h       |  4 ++
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp |  8 +++
 llvm/test/CodeGen/X86/addcarry.ll           |  2 +-
 .../CodeGen/X86/apx/flags-copy-lowering.ll  |  8 +--
 llvm/test/CodeGen/X86/avgflooru-i128.ll     |  2 +-
 llvm/test/CodeGen/X86/fmuladd-soft-float.ll | 54 +++++++++----------
 llvm/test/CodeGen/X86/i128-abi.ll           | 10 ++--
 llvm/test/CodeGen/X86/sadd_sat_vec.ll       | 36 ++++++------
 llvm/test/CodeGen/X86/ssub_sat_vec.ll       | 36 ++++++------
 llvm/test/CodeGen/X86/subcarry.ll           |  2 +-
 llvm/test/CodeGen/X86/uadd_sat_vec.ll       |  8 +--
 llvm/test/CodeGen/X86/usub_sat_vec.ll       |  8 +--
 14 files changed, 130 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index 7359ef341dde5..0b4c63f7a81f7 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -340,5 +340,39 @@ static bool CC_X86_64_Pointer(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
   return false;
 }
 
+/// Special handling for i128: Either allocate the value to two consecutive
+/// i64 registers, or to the stack. Do not partially allocate in registers,
+/// and do not reserve any registers when allocating to the stack.
+static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                           CCValAssign::LocInfo &LocInfo,
+                           ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert(ValVT == MVT::i64 && "Should have i64 parts");
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  unsigned NumRegs = PendingMembers.size();
+  assert(NumRegs == 2 && "Should have two parts");
+
+  static const MCPhysReg Regs[] = {X86::RDI, X86::RSI, X86::RDX,
+                                   X86::RCX, X86::R8,  X86::R9};
+  ArrayRef<MCPhysReg> Allocated = State.AllocateRegBlock(Regs, NumRegs);
+  if (!Allocated.empty()) {
+    PendingMembers[0].convertToReg(Allocated[0]);
+    PendingMembers[1].convertToReg(Allocated[1]);
+  } else {
+    int64_t Offset = State.AllocateStack(16, Align(16));
+    PendingMembers[0].convertToMem(Offset);
+    PendingMembers[1].convertToMem(Offset + 8);
+  }
+  State.addLoc(PendingMembers[0]);
+  State.addLoc(PendingMembers[1]);
+  PendingMembers.clear();
+  return true;
+}
+
 // Provides entry points of CC_X86 and RetCC_X86.
 #include "X86GenCallingConv.inc"
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 91af111db8cda..72b103b0bb0c5 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -548,11 +548,9 @@ def CC_X86_64_C : CallingConv<[
   CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
 
   // i128 can be either passed in two i64 registers, or on the stack, but
-  // not split across register and stack. As such, do not allow using R9
-  // for a split i64.
+  // not split across register and stack. Handle this with a custom function.
CCIfType<[i64], - CCIfSplit>>, - CCIfType<[i64], CCIfSplit>>, + CCIfConsecutiveRegs>>, CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index e07bcd989c518..fe79fefeed631 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1604,6 +1604,10 @@ namespace llvm { LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override; + bool functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg, + const DataLayout &DL) const override; + bool isIntDivCheap(EVT VT, AttributeList Attr) const override; bool supportSwiftError() const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 4a4fd246cb7cd..6835c7e336a5c 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -233,6 +233,14 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, return VT.changeVectorElementTypeToInteger(); } +bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg, + const DataLayout &DL) const { + // i128 split into i64 needs to be allocated to two consecutive registers, + // or spilled to the stack as a whole. + return Ty->isIntegerTy(128); +} + /// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, Align &MaxAlign) { diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll index f8d32fc2d2925..97894db1188e2 100644 --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -49,7 +49,7 @@ define i256 @add256(i256 %a, i256 %b) nounwind { ; CHECK-LABEL: add256: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: addq %r9, %rsi ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8 diff --git a/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll b/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll index deca130a04ff0..bd764c2edef29 100644 --- a/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll +++ b/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll @@ -31,15 +31,15 @@ define <2 x i128> @flag_copy_2(<2 x i128> %x, <2 x i128> %y) nounwind { ; CHECK-NEXT: movq %r8, %rdi ; CHECK-NEXT: {nf} sarq $63, %rdi ; CHECK-NEXT: cmovoq %rdi, %rcx -; CHECK-NEXT: movabsq $-9223372036854775808, %r9 # imm = 0x8000000000000000 -; CHECK-NEXT: {nf} xorq %r9, %rdi +; CHECK-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000 +; CHECK-NEXT: {nf} xorq %r10, %rdi ; CHECK-NEXT: cmovnoq %r8, %rdi -; CHECK-NEXT: subq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: subq %r9, %rsi ; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx ; CHECK-NEXT: movq %rdx, %r8 ; CHECK-NEXT: {nf} sarq $63, %r8 ; CHECK-NEXT: cmovoq %r8, %rsi -; CHECK-NEXT: {nf} xorq %r9, %r8 +; CHECK-NEXT: {nf} xorq %r10, %r8 ; CHECK-NEXT: cmovnoq %rdx, %r8 ; CHECK-NEXT: movq %rcx, 16(%rax) ; CHECK-NEXT: movq %rsi, (%rax) diff --git a/llvm/test/CodeGen/X86/avgflooru-i128.ll b/llvm/test/CodeGen/X86/avgflooru-i128.ll index da16a7da48ca6..11e886e25ba4e 100644 --- a/llvm/test/CodeGen/X86/avgflooru-i128.ll +++ b/llvm/test/CodeGen/X86/avgflooru-i128.ll @@ -119,7 +119,7 @@ define <2 x i128> @avgflooru_i128_vec(<2 x 
i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: # %bb.0: # %start ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: addq %r9, %rsi ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; CHECK-NEXT: setb %dil ; CHECK-NEXT: movzbl %dil, %edi diff --git a/llvm/test/CodeGen/X86/fmuladd-soft-float.ll b/llvm/test/CodeGen/X86/fmuladd-soft-float.ll index ccb2f37590b0a..cbdfa32ed4627 100644 --- a/llvm/test/CodeGen/X86/fmuladd-soft-float.ll +++ b/llvm/test/CodeGen/X86/fmuladd-soft-float.ll @@ -1555,30 +1555,30 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 ; SOFT-FLOAT-64-NEXT: .cfi_offset %r14, -32 ; SOFT-FLOAT-64-NEXT: .cfi_offset %r15, -24 ; SOFT-FLOAT-64-NEXT: .cfi_offset %rbp, -16 +; SOFT-FLOAT-64-NEXT: movq %r9, %rbp ; SOFT-FLOAT-64-NEXT: movq %rcx, %r14 ; SOFT-FLOAT-64-NEXT: movq %rdx, %r15 -; SOFT-FLOAT-64-NEXT: movq %rsi, %r12 +; SOFT-FLOAT-64-NEXT: movq %rsi, %r13 ; SOFT-FLOAT-64-NEXT: movq %rdi, %rbx -; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: movq %r8, %rdi ; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT -; SOFT-FLOAT-64-NEXT: movq %rax, %r13 +; SOFT-FLOAT-64-NEXT: movq %rax, %r12 ; SOFT-FLOAT-64-NEXT: movq %r14, %rdi -; SOFT-FLOAT-64-NEXT: movq %rbp, %rsi +; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-NEXT: movq %rax, %r14 ; SOFT-FLOAT-64-NEXT: movq %r15, %rdi ; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-NEXT: movq %rax, %r15 -; SOFT-FLOAT-64-NEXT: movq %r12, %rdi -; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-NEXT: movq %rbp, %rsi ; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-NEXT: movq %rax, %rdi ; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT -; SOFT-FLOAT-64-NEXT: movq %rax, %r12 +; SOFT-FLOAT-64-NEXT: movq %rax, %r13 ; SOFT-FLOAT-64-NEXT: movq %r15, %rdi ; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT @@ -1587,13 +1587,13 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 ; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT ; SOFT-FLOAT-64-NEXT: movq %rax, %r14 -; SOFT-FLOAT-64-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-NEXT: movq %r12, %rdi ; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT ; SOFT-FLOAT-64-NEXT: movq %rax, 24(%rbx) ; SOFT-FLOAT-64-NEXT: movq %r14, 16(%rbx) ; SOFT-FLOAT-64-NEXT: movq %r15, 8(%rbx) -; SOFT-FLOAT-64-NEXT: movq %r12, (%rbx) +; SOFT-FLOAT-64-NEXT: movq %r13, (%rbx) ; SOFT-FLOAT-64-NEXT: movq %rbx, %rax ; SOFT-FLOAT-64-NEXT: addq $8, %rsp ; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 56 @@ -1633,30 +1633,30 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 ; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r14, -32 ; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r15, -24 ; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %rbp, -16 +; SOFT-FLOAT-64-FMA-NEXT: movq %r9, %rbp ; SOFT-FLOAT-64-FMA-NEXT: movq %rcx, %r14 ; SOFT-FLOAT-64-FMA-NEXT: movq %rdx, %r15 -; SOFT-FLOAT-64-FMA-NEXT: movq %rsi, %r12 +; SOFT-FLOAT-64-FMA-NEXT: movq %rsi, %r13 ; SOFT-FLOAT-64-FMA-NEXT: movq %rdi, %rbx -; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: movq %r8, 
%rdi ; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT -; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r13 +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r12 ; SOFT-FLOAT-64-FMA-NEXT: movq %r14, %rdi -; SOFT-FLOAT-64-FMA-NEXT: movq %rbp, %rsi +; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r14 ; SOFT-FLOAT-64-FMA-NEXT: movq %r15, %rdi ; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r15 -; SOFT-FLOAT-64-FMA-NEXT: movq %r12, %rdi -; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq %rbp, %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %rdi ; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT -; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r12 +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r13 ; SOFT-FLOAT-64-FMA-NEXT: movq %r15, %rdi ; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT @@ -1665,13 +1665,13 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 ; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT ; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r14 -; SOFT-FLOAT-64-FMA-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq %r12, %rdi ; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT ; SOFT-FLOAT-64-FMA-NEXT: movq %rax, 24(%rbx) ; SOFT-FLOAT-64-FMA-NEXT: movq %r14, 16(%rbx) ; SOFT-FLOAT-64-FMA-NEXT: movq %r15, 8(%rbx) -; SOFT-FLOAT-64-FMA-NEXT: movq %r12, (%rbx) +; SOFT-FLOAT-64-FMA-NEXT: movq %r13, (%rbx) ; SOFT-FLOAT-64-FMA-NEXT: movq %rbx, %rax ; SOFT-FLOAT-64-FMA-NEXT: addq $8, %rsp ; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 56 @@ -1711,30 +1711,30 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 ; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r14, -32 ; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r15, -24 ; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %rbp, -16 +; SOFT-FLOAT-64-FMA4-NEXT: movq %r9, %rbp ; SOFT-FLOAT-64-FMA4-NEXT: movq %rcx, %r14 ; SOFT-FLOAT-64-FMA4-NEXT: movq %rdx, %r15 -; SOFT-FLOAT-64-FMA4-NEXT: movq %rsi, %r12 +; SOFT-FLOAT-64-FMA4-NEXT: movq %rsi, %r13 ; SOFT-FLOAT-64-FMA4-NEXT: movq %rdi, %rbx -; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: movq %r8, %rdi ; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT -; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r13 +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r12 ; SOFT-FLOAT-64-FMA4-NEXT: movq %r14, %rdi -; SOFT-FLOAT-64-FMA4-NEXT: movq %rbp, %rsi +; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r14 ; SOFT-FLOAT-64-FMA4-NEXT: movq %r15, %rdi ; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r15 -; SOFT-FLOAT-64-FMA4-NEXT: movq %r12, %rdi -; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA4-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq %rbp, %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %rdi ; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT -; 
SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r12 +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r13 ; SOFT-FLOAT-64-FMA4-NEXT: movq %r15, %rdi ; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT @@ -1743,13 +1743,13 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 ; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT ; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r14 -; SOFT-FLOAT-64-FMA4-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq %r12, %rdi ; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT ; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, 24(%rbx) ; SOFT-FLOAT-64-FMA4-NEXT: movq %r14, 16(%rbx) ; SOFT-FLOAT-64-FMA4-NEXT: movq %r15, 8(%rbx) -; SOFT-FLOAT-64-FMA4-NEXT: movq %r12, (%rbx) +; SOFT-FLOAT-64-FMA4-NEXT: movq %r13, (%rbx) ; SOFT-FLOAT-64-FMA4-NEXT: movq %rbx, %rax ; SOFT-FLOAT-64-FMA4-NEXT: addq $8, %rsp ; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 56 diff --git a/llvm/test/CodeGen/X86/i128-abi.ll b/llvm/test/CodeGen/X86/i128-abi.ll index 23eb6ec0322ab..264c546b4cae2 100644 --- a/llvm/test/CodeGen/X86/i128-abi.ll +++ b/llvm/test/CodeGen/X86/i128-abi.ll @@ -31,7 +31,7 @@ define i128 @on_stack2(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5, i1 define i64 @trailing_arg_on_stack(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5, i64 %a6) { ; CHECK-LABEL: trailing_arg_on_stack: ; CHECK: # %bb.0: -; CHECK-NEXT: movq 24(%rsp), %rax +; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: retq ret i64 %a6 } @@ -78,20 +78,18 @@ define void @call_trailing_arg_on_stack(i128 %x, i64 %y) nounwind { ; CHECK-LABEL: call_trailing_arg_on_stack: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movq %rdx, %rax -; CHECK-NEXT: movq %rsi, %r9 +; CHECK-NEXT: movq %rdx, %r9 +; CHECK-NEXT: movq %rsi, %rax ; CHECK-NEXT: movq %rdi, %r10 -; CHECK-NEXT: subq $8, %rsp ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: movl $2, %edx ; CHECK-NEXT: movl $3, %ecx ; CHECK-NEXT: movl $4, %r8d ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: pushq %r9 ; CHECK-NEXT: pushq %r10 ; CHECK-NEXT: callq trailing_arg_on_stack@PLT -; CHECK-NEXT: addq $32, %rsp +; CHECK-NEXT: addq $16, %rsp ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq call i128 @trailing_arg_on_stack(i64 0, i64 1, i64 2, i64 3, i64 4, i128 %x, i64 %y) diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index 322acd76e12e6..bd563f97b0ac4 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -1795,27 +1795,27 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %r8 ; SSE-NEXT: seto %dil -; SSE-NEXT: movq %r8, %r9 -; SSE-NEXT: sarq $63, %r9 +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: sarq $63, %r10 ; SSE-NEXT: testb %dil, %dil -; SSE-NEXT: cmovneq %r9, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000 -; SSE-NEXT: xorq %r10, %r9 +; SSE-NEXT: cmovneq %r10, %rcx +; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 +; SSE-NEXT: xorq %r11, %r10 ; SSE-NEXT: testb %dil, %dil -; SSE-NEXT: cmoveq %r8, %r9 -; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: cmoveq %r8, %r10 +; SSE-NEXT: addq %r9, %rsi ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; SSE-NEXT: seto %dil ; SSE-NEXT: movq %rdx, %r8 ; SSE-NEXT: sarq $63, %r8 ; SSE-NEXT: testb %dil, %dil ; SSE-NEXT: cmovneq %r8, %rsi -; 
SSE-NEXT: xorq %r10, %r8 +; SSE-NEXT: xorq %r11, %r8 ; SSE-NEXT: testb %dil, %dil ; SSE-NEXT: cmoveq %rdx, %r8 ; SSE-NEXT: movq %rcx, 16(%rax) ; SSE-NEXT: movq %rsi, (%rax) -; SSE-NEXT: movq %r9, 24(%rax) +; SSE-NEXT: movq %r10, 24(%rax) ; SSE-NEXT: movq %r8, 8(%rax) ; SSE-NEXT: retq ; @@ -1825,27 +1825,27 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %r8 ; AVX-NEXT: seto %dil -; AVX-NEXT: movq %r8, %r9 -; AVX-NEXT: sarq $63, %r9 +; AVX-NEXT: movq %r8, %r10 +; AVX-NEXT: sarq $63, %r10 ; AVX-NEXT: testb %dil, %dil -; AVX-NEXT: cmovneq %r9, %rcx -; AVX-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000 -; AVX-NEXT: xorq %r10, %r9 +; AVX-NEXT: cmovneq %r10, %rcx +; AVX-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 +; AVX-NEXT: xorq %r11, %r10 ; AVX-NEXT: testb %dil, %dil -; AVX-NEXT: cmoveq %r8, %r9 -; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; AVX-NEXT: cmoveq %r8, %r10 +; AVX-NEXT: addq %r9, %rsi ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: seto %dil ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: sarq $63, %r8 ; AVX-NEXT: testb %dil, %dil ; AVX-NEXT: cmovneq %r8, %rsi -; AVX-NEXT: xorq %r10, %r8 +; AVX-NEXT: xorq %r11, %r8 ; AVX-NEXT: testb %dil, %dil ; AVX-NEXT: cmoveq %rdx, %r8 ; AVX-NEXT: movq %rcx, 16(%rax) ; AVX-NEXT: movq %rsi, (%rax) -; AVX-NEXT: movq %r9, 24(%rax) +; AVX-NEXT: movq %r10, 24(%rax) ; AVX-NEXT: movq %r8, 8(%rax) ; AVX-NEXT: retq %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y) diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index ac8b561abf003..88df3c175ec9c 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -2026,27 +2026,27 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 ; SSE-NEXT: seto %dil -; SSE-NEXT: movq %r8, %r9 -; SSE-NEXT: sarq $63, %r9 +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: sarq $63, %r10 ; SSE-NEXT: testb %dil, %dil -; SSE-NEXT: cmovneq %r9, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000 -; SSE-NEXT: xorq %r10, %r9 +; SSE-NEXT: cmovneq %r10, %rcx +; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 +; SSE-NEXT: xorq %r11, %r10 ; SSE-NEXT: testb %dil, %dil -; SSE-NEXT: cmoveq %r8, %r9 -; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: cmoveq %r8, %r10 +; SSE-NEXT: subq %r9, %rsi ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx ; SSE-NEXT: seto %dil ; SSE-NEXT: movq %rdx, %r8 ; SSE-NEXT: sarq $63, %r8 ; SSE-NEXT: testb %dil, %dil ; SSE-NEXT: cmovneq %r8, %rsi -; SSE-NEXT: xorq %r10, %r8 +; SSE-NEXT: xorq %r11, %r8 ; SSE-NEXT: testb %dil, %dil ; SSE-NEXT: cmoveq %rdx, %r8 ; SSE-NEXT: movq %rcx, 16(%rax) ; SSE-NEXT: movq %rsi, (%rax) -; SSE-NEXT: movq %r9, 24(%rax) +; SSE-NEXT: movq %r10, 24(%rax) ; SSE-NEXT: movq %r8, 8(%rax) ; SSE-NEXT: retq ; @@ -2056,27 +2056,27 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 ; AVX-NEXT: seto %dil -; AVX-NEXT: movq %r8, %r9 -; AVX-NEXT: sarq $63, %r9 +; AVX-NEXT: movq %r8, %r10 +; AVX-NEXT: sarq $63, %r10 ; AVX-NEXT: testb %dil, %dil -; AVX-NEXT: cmovneq %r9, %rcx -; AVX-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000 -; AVX-NEXT: xorq %r10, %r9 +; AVX-NEXT: cmovneq %r10, %rcx +; AVX-NEXT: movabsq 
$-9223372036854775808, %r11 # imm = 0x8000000000000000 +; AVX-NEXT: xorq %r11, %r10 ; AVX-NEXT: testb %dil, %dil -; AVX-NEXT: cmoveq %r8, %r9 -; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rsi +; AVX-NEXT: cmoveq %r8, %r10 +; AVX-NEXT: subq %r9, %rsi ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: seto %dil ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: sarq $63, %r8 ; AVX-NEXT: testb %dil, %dil ; AVX-NEXT: cmovneq %r8, %rsi -; AVX-NEXT: xorq %r10, %r8 +; AVX-NEXT: xorq %r11, %r8 ; AVX-NEXT: testb %dil, %dil ; AVX-NEXT: cmoveq %rdx, %r8 ; AVX-NEXT: movq %rcx, 16(%rax) ; AVX-NEXT: movq %rsi, (%rax) -; AVX-NEXT: movq %r9, 24(%rax) +; AVX-NEXT: movq %r10, 24(%rax) ; AVX-NEXT: movq %r8, 8(%rax) ; AVX-NEXT: retq %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y) diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll index 1e9db9f55a8d5..9538ea1061cd1 100644 --- a/llvm/test/CodeGen/X86/subcarry.ll +++ b/llvm/test/CodeGen/X86/subcarry.ll @@ -21,7 +21,7 @@ define i256 @sub256(i256 %a, i256 %b) nounwind { ; CHECK-LABEL: sub256: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: subq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: subq %r9, %rsi ; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx ; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll index 1ff95c876a6b1..d744ce6ed6af0 100644 --- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -1161,11 +1161,11 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE-LABEL: v2i128: ; SSE: # %bb.0: ; SSE-NEXT: movq %rdi, %rax -; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: addq %r9, %rsi ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; SSE-NEXT: movq $-1, %rdi -; SSE-NEXT: cmovbq %rdi, %rdx ; SSE-NEXT: cmovbq %rdi, %rsi +; SSE-NEXT: cmovbq %rdi, %rdx ; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %r8 ; SSE-NEXT: cmovbq %rdi, %r8 @@ -1179,11 +1179,11 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; AVX-LABEL: v2i128: ; AVX: # %bb.0: ; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; AVX-NEXT: addq %r9, %rsi ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: movq $-1, %rdi -; AVX-NEXT: cmovbq %rdi, %rdx ; AVX-NEXT: cmovbq %rdi, %rsi +; AVX-NEXT: cmovbq %rdi, %rdx ; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %r8 ; AVX-NEXT: cmovbq %rdi, %r8 diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll index 34eb30dfebeeb..4e17ca6fbae33 100644 --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -1057,10 +1057,10 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: xorl %edi, %edi -; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: subq %r9, %rsi ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx -; SSE-NEXT: cmovbq %rdi, %rdx ; SSE-NEXT: cmovbq %rdi, %rsi +; SSE-NEXT: cmovbq %rdi, %rdx ; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 ; SSE-NEXT: cmovbq %rdi, %r8 @@ -1075,10 +1075,10 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: xorl %edi, %edi -; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rsi +; AVX-NEXT: subq %r9, %rsi ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx -; AVX-NEXT: cmovbq %rdi, %rdx ; AVX-NEXT: cmovbq %rdi, 
%rsi +; AVX-NEXT: cmovbq %rdi, %rdx ; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 ; AVX-NEXT: cmovbq %rdi, %r8 From 9cf52fe1f94fdcd8e27c76f7d33a80eeb2075833 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Fri, 24 Jan 2025 14:48:47 +0000 Subject: [PATCH 011/432] [flang][OpenMP][NFC] test the current private dealloc runtime calls (#124017) It looks like in most cases we still don't make calls to deallocate allocatable members of derived types which have been privatized. This is just intended to add a test for the one case where we do, to make sure this doesn't regress with my upcoming changes. --- flang/test/Lower/OpenMP/derived-type-allocatable.f90 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flang/test/Lower/OpenMP/derived-type-allocatable.f90 b/flang/test/Lower/OpenMP/derived-type-allocatable.f90 index 2dc4e20f27af2..1d6e22212eedd 100644 --- a/flang/test/Lower/OpenMP/derived-type-allocatable.f90 +++ b/flang/test/Lower/OpenMP/derived-type-allocatable.f90 @@ -24,6 +24,9 @@ module m1 !CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_array_of_allocs !CHECK: fir.call @_FortranAInitializeClone !CHECK-NEXT: omp.yield +!CHECK: } dealloc { +!CHECK: fir.call @_FortranAAllocatableDeallocate +!CHECK: omp.yield !CHECK-LABEL: omp.private {type = firstprivate} @_QMm1Ftest_array !CHECK-NOT: fir.call @_FortranAInitializeClone From c546b5317c518987a5f45dd4c4d25321a955c758 Mon Sep 17 00:00:00 2001 From: DianQK Date: Fri, 24 Jan 2025 23:02:50 +0800 Subject: [PATCH 012/432] [ValueTracking] Pass changed predicate `SignedLPred` to `isImpliedByMatchingCmp` (#124271) Fixes #124267. Since we are using the new predicate, we should also update the parameters of `isImpliedByMatchingCmp`. --- llvm/lib/Analysis/ValueTracking.cpp | 4 +- .../implied-condition-samesign.ll | 48 +++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 264fedd6b66b9..eba728c7c8c36 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9494,7 +9494,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, SignedLPred == ICmpInst::ICMP_SGE) && match(R0, m_NSWSub(m_Specific(L0), m_Specific(L1)))) { if (match(R1, m_NonPositive()) && - ICmpInst::isImpliedByMatchingCmp(LPred, RPred) == false) + ICmpInst::isImpliedByMatchingCmp(SignedLPred, RPred) == false) return false; } @@ -9504,7 +9504,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, SignedLPred == ICmpInst::ICMP_SLE) && match(R0, m_NSWSub(m_Specific(L0), m_Specific(L1)))) { if (match(R1, m_NonNegative()) && - ICmpInst::isImpliedByMatchingCmp(LPred, RPred) == true) + ICmpInst::isImpliedByMatchingCmp(SignedLPred, RPred) == true) return true; } diff --git a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll index 0e6db403512ae..9a0591245fae0 100644 --- a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll +++ b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll @@ -292,3 +292,51 @@ taken: end: ret i32 0 } + +define i1 @gt_sub_nsw_ult(i8 %L0, i8 %L1, i1 %V) { +; CHECK-LABEL: define i1 @gt_sub_nsw_ult( +; CHECK-SAME: i8 [[L0:%.*]], i8 [[L1:%.*]], i1 [[V:%.*]]) { +; CHECK-NEXT: [[LHS:%.*]] = icmp samesign ugt i8 [[L0]], [[L1]] +; CHECK-NEXT: br i1 [[LHS]], label %[[LHS_TRUE:.*]], label %[[LHS_FALSE:.*]] +; CHECK: [[LHS_TRUE]]: +; CHECK-NEXT: [[R0:%.*]] = 
sub nsw i8 [[L0]], [[L1]]
+; CHECK-NEXT: [[RHS:%.*]] = icmp ult i8 [[R0]], -1
+; CHECK-NEXT: ret i1 [[RHS]]
+; CHECK: [[LHS_FALSE]]:
+; CHECK-NEXT: ret i1 [[V]]
+;
+  %LHS = icmp samesign ugt i8 %L0, %L1
+  br i1 %LHS, label %LHS_true, label %LHS_false
+
+LHS_true:
+  %R0 = sub nsw i8 %L0, %L1
+  %RHS = icmp ult i8 %R0, -1
+  ret i1 %RHS
+
+LHS_false:
+  ret i1 %V
+}
+
+define i1 @lt_sub_nsw_ult(i8 %L0, i8 %L1, i1 %V) {
+; CHECK-LABEL: define i1 @lt_sub_nsw_ult(
+; CHECK-SAME: i8 [[L0:%.*]], i8 [[L1:%.*]], i1 [[V:%.*]]) {
+; CHECK-NEXT: [[LHS:%.*]] = icmp samesign ult i8 [[L0]], [[L1]]
+; CHECK-NEXT: br i1 [[LHS]], label %[[LHS_TRUE:.*]], label %[[LHS_FALSE:.*]]
+; CHECK: [[LHS_TRUE]]:
+; CHECK-NEXT: [[R0:%.*]] = sub nsw i8 [[L0]], [[L1]]
+; CHECK-NEXT: [[RHS:%.*]] = icmp ult i8 [[R0]], 1
+; CHECK-NEXT: ret i1 [[RHS]]
+; CHECK: [[LHS_FALSE]]:
+; CHECK-NEXT: ret i1 [[V]]
+;
+  %LHS = icmp samesign ult i8 %L0, %L1
+  br i1 %LHS, label %LHS_true, label %LHS_false
+
+LHS_true:
+  %R0 = sub nsw i8 %L0, %L1
+  %RHS = icmp ult i8 %R0, 1
+  ret i1 %RHS
+
+LHS_false:
+  ret i1 %V
+}

From a12d7e4b611f0db2525da68f5576beaeeb6c84ac Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 24 Jan 2025 15:13:13 +0000
Subject: [PATCH 013/432] [SLP] getVectorCallCosts - don't provide scalar argument data for vector IntrinsicCostAttributes (#124254)

getVectorCallCosts determines the cost of a vector intrinsic, based on
an existing scalar intrinsic call - but we were including the scalar
argument data in the IntrinsicCostAttributes, which meant that not only
was the cost calculation not type-only based, it was making incorrect
assumptions about constant values etc.

This also exposed an issue where x86 relied on fallback calculations for
funnel shift costs - this is great when we have the argument data as
that improves the accuracy of uniform shift amounts etc., but it meant
that type-only costs would default to Cost=2 for all custom lowered
funnel shifts, which was far too cheap.

This is the reverse of #124129 where we weren't including argument data
when we could.

Fixes #63980
---
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  18 +
 .../Transforms/Vectorize/SLPVectorizer.cpp    |   4 +-
 .../SLPVectorizer/X86/arith-fshl-rot.ll       | 445 ++++++++----------
 .../SLPVectorizer/X86/arith-fshr-rot.ll       | 445 ++++++++----------
 4 files changed, 409 insertions(+), 503 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 34ba46f5e6cfd..d3c923a76d074 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4719,6 +4719,24 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
       if (auto KindCost = Entry->Cost[CostKind])
         return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
+
+    // Without arg data, we need to compute the expanded costs of custom lowered
+    // intrinsics to prevent use of the (very low) default costs.
+    if (ICA.isTypeBasedOnly() &&
+        (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
+      Type *CondTy = RetTy->getWithNewBitWidth(1);
+      InstructionCost Cost = 0;
+      Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
+      Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
+      Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
+      Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
+      Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
+      Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
+                                 CmpInst::ICMP_EQ, CostKind);
+      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+                                 CmpInst::ICMP_EQ, CostKind);
+      return Cost;
+    }
   }
 
   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 040c57703b7c6..eea6b32460d70 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9031,9 +9031,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
   FastMathFlags FMF;
   if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
     FMF = FPCI->getFastMathFlags();
-  SmallVector<const Value *> Arguments(CI->args());
-  IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
-                                    dyn_cast<IntrinsicInst>(CI));
+  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
   auto IntrinsicCost =
       TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
index 153191b1eea08..3b526c4537243 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2
-; RUN: opt < %s 
-mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -240,16 +240,46 @@ define void @fshl_v16i32() { ; SSE-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 15), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @fshl_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) -; AVX-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @fshl_v16i32( +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; AVX1-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; AVX1-NEXT: store <4 x i32> [[TMP3]], ptr @d32, align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; AVX1-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; AVX1-NEXT: store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) +; AVX1-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 12), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @fshl_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX2-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX2-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 +; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), 
align 4 +; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX2-NEXT: ret void +; +; AVX256-LABEL: @fshl_v16i32( +; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX256-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX256-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fshl_v16i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 @@ -333,155 +363,136 @@ define void @fshl_v16i32() { } define void @fshl_v32i16() { -; SSE2-LABEL: @fshl_v32i16( -; SSE2-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 -; SSE2-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 -; SSE2-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 -; SSE2-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 -; SSE2-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 -; SSE2-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 -; SSE2-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 -; SSE2-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 -; SSE2-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE2-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 -; SSE2-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 -; SSE2-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 -; SSE2-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 -; SSE2-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 -; SSE2-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 -; SSE2-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 -; SSE2-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE2-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 -; SSE2-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 -; SSE2-NEXT: [[A19:%.*]] = load i16, ptr getelementptr 
inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 -; SSE2-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 -; SSE2-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 -; SSE2-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 -; SSE2-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 -; SSE2-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE2-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 -; SSE2-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 -; SSE2-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 -; SSE2-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 -; SSE2-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 -; SSE2-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 -; SSE2-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 -; SSE2-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 -; SSE2-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 -; SSE2-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 -; SSE2-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 -; SSE2-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 -; SSE2-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 -; SSE2-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 -; SSE2-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 -; SSE2-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE2-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 -; SSE2-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 -; SSE2-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 -; SSE2-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 -; SSE2-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 -; SSE2-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 -; SSE2-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 -; SSE2-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE2-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 -; SSE2-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 -; SSE2-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 
19), align 2 -; SSE2-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 -; SSE2-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 -; SSE2-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 -; SSE2-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 -; SSE2-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE2-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 -; SSE2-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 -; SSE2-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 -; SSE2-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 -; SSE2-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 -; SSE2-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 -; SSE2-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 -; SSE2-NEXT: [[R0:%.*]] = call i16 @llvm.fshl.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]]) -; SSE2-NEXT: [[R1:%.*]] = call i16 @llvm.fshl.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]]) -; SSE2-NEXT: [[R2:%.*]] = call i16 @llvm.fshl.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]]) -; SSE2-NEXT: [[R3:%.*]] = call i16 @llvm.fshl.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]]) -; SSE2-NEXT: [[R4:%.*]] = call i16 @llvm.fshl.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]]) -; SSE2-NEXT: [[R5:%.*]] = call i16 @llvm.fshl.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]]) -; SSE2-NEXT: [[R6:%.*]] = call i16 @llvm.fshl.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]]) -; SSE2-NEXT: [[R7:%.*]] = call i16 @llvm.fshl.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]]) -; SSE2-NEXT: [[R8:%.*]] = call i16 @llvm.fshl.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]]) -; SSE2-NEXT: [[R9:%.*]] = call i16 @llvm.fshl.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]]) -; SSE2-NEXT: [[R10:%.*]] = call i16 @llvm.fshl.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]]) -; SSE2-NEXT: [[R11:%.*]] = call i16 @llvm.fshl.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]]) -; SSE2-NEXT: [[R12:%.*]] = call i16 @llvm.fshl.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]]) -; SSE2-NEXT: [[R13:%.*]] = call i16 @llvm.fshl.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]]) -; SSE2-NEXT: [[R14:%.*]] = call i16 @llvm.fshl.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]]) -; SSE2-NEXT: [[R15:%.*]] = call i16 @llvm.fshl.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]]) -; SSE2-NEXT: [[R16:%.*]] = call i16 @llvm.fshl.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]]) -; SSE2-NEXT: [[R17:%.*]] = call i16 @llvm.fshl.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]]) -; SSE2-NEXT: [[R18:%.*]] = call i16 @llvm.fshl.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]]) -; SSE2-NEXT: [[R19:%.*]] = call i16 @llvm.fshl.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]]) -; SSE2-NEXT: [[R20:%.*]] = call i16 @llvm.fshl.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]]) -; SSE2-NEXT: [[R21:%.*]] = call i16 @llvm.fshl.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]]) -; SSE2-NEXT: [[R22:%.*]] = call i16 @llvm.fshl.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]]) -; SSE2-NEXT: [[R23:%.*]] = call i16 @llvm.fshl.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]]) -; SSE2-NEXT: [[R24:%.*]] = call i16 
@llvm.fshl.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]]) -; SSE2-NEXT: [[R25:%.*]] = call i16 @llvm.fshl.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]]) -; SSE2-NEXT: [[R26:%.*]] = call i16 @llvm.fshl.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]]) -; SSE2-NEXT: [[R27:%.*]] = call i16 @llvm.fshl.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]]) -; SSE2-NEXT: [[R28:%.*]] = call i16 @llvm.fshl.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]]) -; SSE2-NEXT: [[R29:%.*]] = call i16 @llvm.fshl.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]]) -; SSE2-NEXT: [[R30:%.*]] = call i16 @llvm.fshl.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]]) -; SSE2-NEXT: [[R31:%.*]] = call i16 @llvm.fshl.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]]) -; SSE2-NEXT: store i16 [[R0]], ptr @d16, align 2 -; SSE2-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2 -; SSE2-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2 -; SSE2-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2 -; SSE2-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2 -; SSE2-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2 -; SSE2-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2 -; SSE2-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2 -; SSE2-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 -; SSE2-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2 -; SSE2-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2 -; SSE2-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2 -; SSE2-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2 -; SSE2-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2 -; SSE2-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2 -; SSE2-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2 -; SSE2-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 -; SSE2-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2 -; SSE2-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2 -; SSE2-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2 -; SSE2-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2 -; SSE2-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2 -; SSE2-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2 -; SSE2-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2 -; SSE2-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 -; SSE2-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2 -; SSE2-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2 -; SSE2-NEXT: store i16 [[R27]], ptr 
getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2 -; SSE2-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2 -; SSE2-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2 -; SSE2-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2 -; SSE2-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2 -; SSE2-NEXT: ret void -; -; SSE4-LABEL: @fshl_v32i16( -; SSE4-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SSE4-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SSE4-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SSE4-NEXT: store <8 x i16> [[TMP3]], ptr @d16, align 2 -; SSE4-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) -; SSE4-NEXT: store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) -; SSE4-NEXT: store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE4-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE4-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) -; SSE4-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 -; SSE4-NEXT: ret void +; SSE-LABEL: @fshl_v32i16( +; SSE-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 +; SSE-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 +; SSE-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 +; SSE-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 +; SSE-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 +; SSE-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 +; SSE-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 +; SSE-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 +; SSE-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 +; SSE-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 +; SSE-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], 
ptr @a16, i32 0, i64 11), align 2 +; SSE-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 +; SSE-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 +; SSE-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 +; SSE-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 +; SSE-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 +; SSE-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 +; SSE-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 +; SSE-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 +; SSE-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 +; SSE-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 +; SSE-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 +; SSE-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 +; SSE-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 +; SSE-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 +; SSE-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 +; SSE-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 +; SSE-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 +; SSE-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 +; SSE-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 +; SSE-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 +; SSE-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 +; SSE-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 +; SSE-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 +; SSE-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 +; SSE-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 +; SSE-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 +; SSE-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 +; SSE-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 +; SSE-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 +; SSE-NEXT: [[B12:%.*]] = load i16, ptr 
getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 +; SSE-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 +; SSE-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 +; SSE-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 +; SSE-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 +; SSE-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 +; SSE-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 +; SSE-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 +; SSE-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 +; SSE-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 +; SSE-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 +; SSE-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 +; SSE-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 +; SSE-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 +; SSE-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 +; SSE-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 +; SSE-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 +; SSE-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 +; SSE-NEXT: [[R0:%.*]] = call i16 @llvm.fshl.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]]) +; SSE-NEXT: [[R1:%.*]] = call i16 @llvm.fshl.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]]) +; SSE-NEXT: [[R2:%.*]] = call i16 @llvm.fshl.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]]) +; SSE-NEXT: [[R3:%.*]] = call i16 @llvm.fshl.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]]) +; SSE-NEXT: [[R4:%.*]] = call i16 @llvm.fshl.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]]) +; SSE-NEXT: [[R5:%.*]] = call i16 @llvm.fshl.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]]) +; SSE-NEXT: [[R6:%.*]] = call i16 @llvm.fshl.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]]) +; SSE-NEXT: [[R7:%.*]] = call i16 @llvm.fshl.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]]) +; SSE-NEXT: [[R8:%.*]] = call i16 @llvm.fshl.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]]) +; SSE-NEXT: [[R9:%.*]] = call i16 @llvm.fshl.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]]) +; SSE-NEXT: [[R10:%.*]] = call i16 @llvm.fshl.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]]) +; SSE-NEXT: [[R11:%.*]] = call i16 @llvm.fshl.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]]) +; SSE-NEXT: [[R12:%.*]] = call i16 @llvm.fshl.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]]) +; SSE-NEXT: [[R13:%.*]] = call i16 @llvm.fshl.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]]) +; SSE-NEXT: [[R14:%.*]] = call i16 @llvm.fshl.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]]) +; SSE-NEXT: [[R15:%.*]] = call i16 @llvm.fshl.i16(i16 [[A15]], 
i16 [[A15]], i16 [[B15]]) +; SSE-NEXT: [[R16:%.*]] = call i16 @llvm.fshl.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]]) +; SSE-NEXT: [[R17:%.*]] = call i16 @llvm.fshl.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]]) +; SSE-NEXT: [[R18:%.*]] = call i16 @llvm.fshl.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]]) +; SSE-NEXT: [[R19:%.*]] = call i16 @llvm.fshl.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]]) +; SSE-NEXT: [[R20:%.*]] = call i16 @llvm.fshl.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]]) +; SSE-NEXT: [[R21:%.*]] = call i16 @llvm.fshl.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]]) +; SSE-NEXT: [[R22:%.*]] = call i16 @llvm.fshl.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]]) +; SSE-NEXT: [[R23:%.*]] = call i16 @llvm.fshl.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]]) +; SSE-NEXT: [[R24:%.*]] = call i16 @llvm.fshl.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]]) +; SSE-NEXT: [[R25:%.*]] = call i16 @llvm.fshl.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]]) +; SSE-NEXT: [[R26:%.*]] = call i16 @llvm.fshl.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]]) +; SSE-NEXT: [[R27:%.*]] = call i16 @llvm.fshl.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]]) +; SSE-NEXT: [[R28:%.*]] = call i16 @llvm.fshl.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]]) +; SSE-NEXT: [[R29:%.*]] = call i16 @llvm.fshl.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]]) +; SSE-NEXT: [[R30:%.*]] = call i16 @llvm.fshl.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]]) +; SSE-NEXT: [[R31:%.*]] = call i16 @llvm.fshl.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]]) +; SSE-NEXT: store i16 [[R0]], ptr @d16, align 2 +; SSE-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2 +; SSE-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2 +; SSE-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2 +; SSE-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2 +; SSE-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2 +; SSE-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2 +; SSE-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2 +; SSE-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 +; SSE-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2 +; SSE-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2 +; SSE-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2 +; SSE-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2 +; SSE-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2 +; SSE-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2 +; SSE-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2 +; SSE-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 +; SSE-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2 +; SSE-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2 +; SSE-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2 +; SSE-NEXT: store i16 [[R20]], ptr 
getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2 +; SSE-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2 +; SSE-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2 +; SSE-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2 +; SSE-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 +; SSE-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2 +; SSE-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2 +; SSE-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2 +; SSE-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2 +; SSE-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2 +; SSE-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2 +; SSE-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2 +; SSE-NEXT: ret void ; ; AVX-LABEL: @fshl_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 @@ -944,52 +955,16 @@ define void @fshl_v64i8() { } define void @fshl_v2i32() { -; SSE-LABEL: @fshl_v2i32( -; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4 -; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; SSE-NEXT: ret void -; -; AVX1-LABEL: @fshl_v2i32( -; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; AVX1-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @fshl_v2i32( -; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; AVX2-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX2-NEXT: ret void 
-; -; AVX256-LABEL: @fshl_v2i32( -; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) -; AVX256-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4 -; AVX256-NEXT: ret void -; -; AVX512-LABEL: @fshl_v2i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4 -; AVX512-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) -; AVX512-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4 -; AVX512-NEXT: ret void +; CHECK-LABEL: @fshl_v2i32( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) +; CHECK-NEXT: store i32 [[R0]], ptr @d32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; CHECK-NEXT: ret void ; ; AVX512VBMI2-LABEL: @fshl_v2i32( ; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 @@ -1011,44 +986,14 @@ define void @fshl_v2i32() { ; PR63980 define void @fshl_v2i32_uniformconst() { -; SSE-LABEL: @fshl_v2i32_uniformconst( -; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4 -; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; SSE-NEXT: ret void -; -; AVX1-LABEL: @fshl_v2i32_uniformconst( -; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @fshl_v2i32_uniformconst( -; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX2-NEXT: ret void -; -; AVX256-LABEL: @fshl_v2i32_uniformconst( -; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX256-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1)) -; 
AVX256-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 -; AVX256-NEXT: ret void -; -; AVX512-LABEL: @fshl_v2i32_uniformconst( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1)) -; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 -; AVX512-NEXT: ret void +; CHECK-LABEL: @fshl_v2i32_uniformconst( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) +; CHECK-NEXT: store i32 [[R0]], ptr @d32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; CHECK-NEXT: ret void ; ; AVX512VBMI2-LABEL: @fshl_v2i32_uniformconst( ; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll index 4d50ffad7f8b5..aae540b4b2454 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -240,16 +240,46 @@ define void @fshr_v16i32() { ; SSE-NEXT: store 
i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 15), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @fshr_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) -; AVX-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @fshr_v16i32( +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; AVX1-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; AVX1-NEXT: store <4 x i32> [[TMP3]], ptr @d32, align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; AVX1-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; AVX1-NEXT: store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) +; AVX1-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 12), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @fshr_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX2-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX2-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 +; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX2-NEXT: ret void +; +; AVX256-LABEL: 
@fshr_v16i32( +; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX256-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX256-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fshr_v16i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 @@ -333,155 +363,136 @@ define void @fshr_v16i32() { } define void @fshr_v32i16() { -; SSE2-LABEL: @fshr_v32i16( -; SSE2-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 -; SSE2-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 -; SSE2-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 -; SSE2-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 -; SSE2-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 -; SSE2-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 -; SSE2-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 -; SSE2-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 -; SSE2-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE2-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 -; SSE2-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 -; SSE2-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 -; SSE2-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 -; SSE2-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 -; SSE2-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 -; SSE2-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 -; SSE2-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE2-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 -; SSE2-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 -; SSE2-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 -; SSE2-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 -; SSE2-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 -; SSE2-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 -; SSE2-NEXT: 
[[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 -; SSE2-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE2-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 -; SSE2-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 -; SSE2-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 -; SSE2-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 -; SSE2-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 -; SSE2-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 -; SSE2-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 -; SSE2-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 -; SSE2-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 -; SSE2-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 -; SSE2-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 -; SSE2-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 -; SSE2-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 -; SSE2-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 -; SSE2-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 -; SSE2-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE2-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 -; SSE2-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 -; SSE2-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 -; SSE2-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 -; SSE2-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 -; SSE2-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 -; SSE2-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 -; SSE2-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE2-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 -; SSE2-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 -; SSE2-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 -; SSE2-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 -; SSE2-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 -; SSE2-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 -; SSE2-NEXT: [[B23:%.*]] = load i16, ptr getelementptr 
inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 -; SSE2-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE2-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 -; SSE2-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 -; SSE2-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 -; SSE2-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 -; SSE2-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 -; SSE2-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 -; SSE2-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 -; SSE2-NEXT: [[R0:%.*]] = call i16 @llvm.fshr.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]]) -; SSE2-NEXT: [[R1:%.*]] = call i16 @llvm.fshr.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]]) -; SSE2-NEXT: [[R2:%.*]] = call i16 @llvm.fshr.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]]) -; SSE2-NEXT: [[R3:%.*]] = call i16 @llvm.fshr.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]]) -; SSE2-NEXT: [[R4:%.*]] = call i16 @llvm.fshr.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]]) -; SSE2-NEXT: [[R5:%.*]] = call i16 @llvm.fshr.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]]) -; SSE2-NEXT: [[R6:%.*]] = call i16 @llvm.fshr.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]]) -; SSE2-NEXT: [[R7:%.*]] = call i16 @llvm.fshr.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]]) -; SSE2-NEXT: [[R8:%.*]] = call i16 @llvm.fshr.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]]) -; SSE2-NEXT: [[R9:%.*]] = call i16 @llvm.fshr.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]]) -; SSE2-NEXT: [[R10:%.*]] = call i16 @llvm.fshr.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]]) -; SSE2-NEXT: [[R11:%.*]] = call i16 @llvm.fshr.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]]) -; SSE2-NEXT: [[R12:%.*]] = call i16 @llvm.fshr.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]]) -; SSE2-NEXT: [[R13:%.*]] = call i16 @llvm.fshr.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]]) -; SSE2-NEXT: [[R14:%.*]] = call i16 @llvm.fshr.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]]) -; SSE2-NEXT: [[R15:%.*]] = call i16 @llvm.fshr.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]]) -; SSE2-NEXT: [[R16:%.*]] = call i16 @llvm.fshr.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]]) -; SSE2-NEXT: [[R17:%.*]] = call i16 @llvm.fshr.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]]) -; SSE2-NEXT: [[R18:%.*]] = call i16 @llvm.fshr.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]]) -; SSE2-NEXT: [[R19:%.*]] = call i16 @llvm.fshr.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]]) -; SSE2-NEXT: [[R20:%.*]] = call i16 @llvm.fshr.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]]) -; SSE2-NEXT: [[R21:%.*]] = call i16 @llvm.fshr.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]]) -; SSE2-NEXT: [[R22:%.*]] = call i16 @llvm.fshr.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]]) -; SSE2-NEXT: [[R23:%.*]] = call i16 @llvm.fshr.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]]) -; SSE2-NEXT: [[R24:%.*]] = call i16 @llvm.fshr.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]]) -; SSE2-NEXT: [[R25:%.*]] = call i16 @llvm.fshr.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]]) -; SSE2-NEXT: [[R26:%.*]] = call i16 @llvm.fshr.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]]) -; SSE2-NEXT: [[R27:%.*]] = call i16 @llvm.fshr.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]]) -; SSE2-NEXT: [[R28:%.*]] = call i16 @llvm.fshr.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]]) -; 
SSE2-NEXT: [[R29:%.*]] = call i16 @llvm.fshr.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]]) -; SSE2-NEXT: [[R30:%.*]] = call i16 @llvm.fshr.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]]) -; SSE2-NEXT: [[R31:%.*]] = call i16 @llvm.fshr.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]]) -; SSE2-NEXT: store i16 [[R0]], ptr @d16, align 2 -; SSE2-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2 -; SSE2-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2 -; SSE2-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2 -; SSE2-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2 -; SSE2-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2 -; SSE2-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2 -; SSE2-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2 -; SSE2-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 -; SSE2-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2 -; SSE2-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2 -; SSE2-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2 -; SSE2-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2 -; SSE2-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2 -; SSE2-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2 -; SSE2-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2 -; SSE2-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 -; SSE2-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2 -; SSE2-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2 -; SSE2-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2 -; SSE2-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2 -; SSE2-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2 -; SSE2-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2 -; SSE2-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2 -; SSE2-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 -; SSE2-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2 -; SSE2-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2 -; SSE2-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2 -; SSE2-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2 -; SSE2-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2 -; SSE2-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2 -; SSE2-NEXT: store i16 [[R31]], ptr 
getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2 -; SSE2-NEXT: ret void -; -; SSE4-LABEL: @fshr_v32i16( -; SSE4-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SSE4-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SSE4-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SSE4-NEXT: store <8 x i16> [[TMP3]], ptr @d16, align 2 -; SSE4-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) -; SSE4-NEXT: store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) -; SSE4-NEXT: store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE4-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE4-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) -; SSE4-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 -; SSE4-NEXT: ret void +; SSE-LABEL: @fshr_v32i16( +; SSE-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 +; SSE-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 +; SSE-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 +; SSE-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 +; SSE-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 +; SSE-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 +; SSE-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 +; SSE-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 +; SSE-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 +; SSE-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 +; SSE-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 +; SSE-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 +; SSE-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 +; SSE-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 +; SSE-NEXT: [[A15:%.*]] = load i16, ptr getelementptr 
inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 +; SSE-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 +; SSE-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 +; SSE-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 +; SSE-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 +; SSE-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 +; SSE-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 +; SSE-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 +; SSE-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 +; SSE-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 +; SSE-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 +; SSE-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 +; SSE-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 +; SSE-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 +; SSE-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 +; SSE-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 +; SSE-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 +; SSE-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 +; SSE-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 +; SSE-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 +; SSE-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 +; SSE-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 +; SSE-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 +; SSE-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 +; SSE-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 +; SSE-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 +; SSE-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 +; SSE-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 +; SSE-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 +; SSE-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 +; SSE-NEXT: 
[[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 +; SSE-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 +; SSE-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 +; SSE-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 +; SSE-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 +; SSE-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 +; SSE-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 +; SSE-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 +; SSE-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 +; SSE-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 +; SSE-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 +; SSE-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 +; SSE-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 +; SSE-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 +; SSE-NEXT: [[R0:%.*]] = call i16 @llvm.fshr.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]]) +; SSE-NEXT: [[R1:%.*]] = call i16 @llvm.fshr.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]]) +; SSE-NEXT: [[R2:%.*]] = call i16 @llvm.fshr.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]]) +; SSE-NEXT: [[R3:%.*]] = call i16 @llvm.fshr.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]]) +; SSE-NEXT: [[R4:%.*]] = call i16 @llvm.fshr.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]]) +; SSE-NEXT: [[R5:%.*]] = call i16 @llvm.fshr.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]]) +; SSE-NEXT: [[R6:%.*]] = call i16 @llvm.fshr.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]]) +; SSE-NEXT: [[R7:%.*]] = call i16 @llvm.fshr.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]]) +; SSE-NEXT: [[R8:%.*]] = call i16 @llvm.fshr.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]]) +; SSE-NEXT: [[R9:%.*]] = call i16 @llvm.fshr.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]]) +; SSE-NEXT: [[R10:%.*]] = call i16 @llvm.fshr.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]]) +; SSE-NEXT: [[R11:%.*]] = call i16 @llvm.fshr.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]]) +; SSE-NEXT: [[R12:%.*]] = call i16 @llvm.fshr.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]]) +; SSE-NEXT: [[R13:%.*]] = call i16 @llvm.fshr.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]]) +; SSE-NEXT: [[R14:%.*]] = call i16 @llvm.fshr.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]]) +; SSE-NEXT: [[R15:%.*]] = call i16 @llvm.fshr.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]]) +; SSE-NEXT: [[R16:%.*]] = call i16 @llvm.fshr.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]]) +; SSE-NEXT: [[R17:%.*]] = call i16 @llvm.fshr.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]]) +; SSE-NEXT: [[R18:%.*]] = call i16 @llvm.fshr.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]]) +; SSE-NEXT: [[R19:%.*]] = call i16 @llvm.fshr.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]]) +; SSE-NEXT: [[R20:%.*]] = call 
i16 @llvm.fshr.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]]) +; SSE-NEXT: [[R21:%.*]] = call i16 @llvm.fshr.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]]) +; SSE-NEXT: [[R22:%.*]] = call i16 @llvm.fshr.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]]) +; SSE-NEXT: [[R23:%.*]] = call i16 @llvm.fshr.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]]) +; SSE-NEXT: [[R24:%.*]] = call i16 @llvm.fshr.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]]) +; SSE-NEXT: [[R25:%.*]] = call i16 @llvm.fshr.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]]) +; SSE-NEXT: [[R26:%.*]] = call i16 @llvm.fshr.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]]) +; SSE-NEXT: [[R27:%.*]] = call i16 @llvm.fshr.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]]) +; SSE-NEXT: [[R28:%.*]] = call i16 @llvm.fshr.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]]) +; SSE-NEXT: [[R29:%.*]] = call i16 @llvm.fshr.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]]) +; SSE-NEXT: [[R30:%.*]] = call i16 @llvm.fshr.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]]) +; SSE-NEXT: [[R31:%.*]] = call i16 @llvm.fshr.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]]) +; SSE-NEXT: store i16 [[R0]], ptr @d16, align 2 +; SSE-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2 +; SSE-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2 +; SSE-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2 +; SSE-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2 +; SSE-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2 +; SSE-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2 +; SSE-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2 +; SSE-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 +; SSE-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2 +; SSE-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2 +; SSE-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2 +; SSE-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2 +; SSE-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2 +; SSE-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2 +; SSE-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2 +; SSE-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 +; SSE-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2 +; SSE-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2 +; SSE-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2 +; SSE-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2 +; SSE-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2 +; SSE-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2 +; SSE-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2 +; SSE-NEXT: store i16 [[R24]], ptr 
getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 +; SSE-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2 +; SSE-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2 +; SSE-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2 +; SSE-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2 +; SSE-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2 +; SSE-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2 +; SSE-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2 +; SSE-NEXT: ret void ; ; AVX-LABEL: @fshr_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 @@ -944,52 +955,16 @@ define void @fshr_v64i8() { } define void @fshr_v2i32() { -; SSE-LABEL: @fshr_v2i32( -; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4 -; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; SSE-NEXT: ret void -; -; AVX1-LABEL: @fshr_v2i32( -; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; AVX1-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @fshr_v2i32( -; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; AVX2-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX2-NEXT: ret void -; -; AVX256-LABEL: @fshr_v2i32( -; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) -; AVX256-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4 -; AVX256-NEXT: ret void -; -; AVX512-LABEL: @fshr_v2i32( -; AVX512-NEXT: 
[[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4 -; AVX512-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) -; AVX512-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4 -; AVX512-NEXT: ret void +; CHECK-LABEL: @fshr_v2i32( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) +; CHECK-NEXT: store i32 [[R0]], ptr @d32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; CHECK-NEXT: ret void ; ; AVX512VBMI2-LABEL: @fshr_v2i32( ; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 @@ -1011,44 +986,14 @@ define void @fshr_v2i32() { ; PR63980 define void @fshr_v2i32_uniformconst() { -; SSE-LABEL: @fshr_v2i32_uniformconst( -; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) -; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) -; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4 -; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; SSE-NEXT: ret void -; -; AVX1-LABEL: @fshr_v2i32_uniformconst( -; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @fshr_v2i32_uniformconst( -; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX2-NEXT: ret void -; -; AVX256-LABEL: @fshr_v2i32_uniformconst( -; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX256-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1)) -; AVX256-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 -; AVX256-NEXT: ret void -; -; AVX512-LABEL: @fshr_v2i32_uniformconst( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1)) -; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 -; AVX512-NEXT: ret void +; CHECK-LABEL: 
@fshr_v2i32_uniformconst( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) +; CHECK-NEXT: store i32 [[R0]], ptr @d32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; CHECK-NEXT: ret void ; ; AVX512VBMI2-LABEL: @fshr_v2i32_uniformconst( ; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 From 6c11b7e689c89ff46e4472810dd555434eab1010 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 24 Jan 2025 07:23:22 -0800 Subject: [PATCH 014/432] [CodeGen] NFC: Change order of checks in MachineInstr->isDead() (#124207) https://github.com/llvm/llvm-project/pull/123531 moved isDead() from DeadMachineInstrElim to MachineInstr. In the process of moving, I reordered the checks to improve chances of early exit, but this has caused a slight increase in compile time. This PR reverts back to the original order of checks. --- llvm/lib/CodeGen/MachineInstr.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 0f7f525fa479e..a9f756b684360 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -1353,18 +1353,6 @@ bool MachineInstr::wouldBeTriviallyDead() const { bool MachineInstr::isDead(const MachineRegisterInfo &MRI, LiveRegUnits *LivePhysRegs) const { - // Technically speaking inline asm without side effects and no defs can still - // be deleted. But there is so much bad inline asm code out there, we should - // let them be. - if (isInlineAsm()) - return false; - - // If we suspect this instruction may have some side-effects, then we say - // this instruction cannot be dead. - // FIXME: See issue #105950 for why LIFETIME markers are considered dead here. - if (!isLifetimeMarker() && !wouldBeTriviallyDead()) - return false; - // Instructions without side-effects are dead iff they only define dead regs. // This function is hot and this loop returns early in the common case, // so only perform additional checks before this if absolutely necessary. @@ -1385,7 +1373,19 @@ bool MachineInstr::isDead(const MachineRegisterInfo &MRI, } } - return true; + // Technically speaking inline asm without side effects and no defs can still + // be deleted. But there is so much bad inline asm code out there, we should + // let them be. + if (isInlineAsm()) + return false; + + // FIXME: See issue #105950 for why LIFETIME markers are considered dead here. + if (isLifetimeMarker()) + return true; + + // If there are no defs with uses, then we call the instruction dead so long + // as we do not suspect it may have sideeffects. + return wouldBeTriviallyDead(); } static bool MemOperandsHaveAlias(const MachineFrameInfo &MFI, From a94226f9e6f5be4d6978134e7813f22b0510f3d4 Mon Sep 17 00:00:00 2001 From: Eric Astor Date: Fri, 24 Jan 2025 10:30:10 -0500 Subject: [PATCH 015/432] [llvm-ml] Remove unsafe getCurrentSegmentOnly() call (#123355) This call was made unsafe recently, but was not fixed in db48f1a1764023f8efeb055e343b967d1eb37d19 (the commit that fixed the parallel code in AsmParser.cpp).
Fixes #123189 --- llvm/lib/MC/MCParser/COFFMasmParser.cpp | 3 +++ llvm/lib/MC/MCParser/MasmParser.cpp | 3 ++- llvm/test/tools/llvm-ml/bare_proc_error.asm | 7 +++++++ llvm/test/tools/llvm-ml/no_section_error.asm | 4 ++++ 4 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/llvm-ml/bare_proc_error.asm create mode 100644 llvm/test/tools/llvm-ml/no_section_error.asm diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp index c323e64a40aee..8464a2392680b 100644 --- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp @@ -441,6 +441,9 @@ bool COFFMasmParser::parseDirectiveOption(StringRef Directive, SMLoc Loc) { /// statements /// label "endproc" bool COFFMasmParser::parseDirectiveProc(StringRef Directive, SMLoc Loc) { + if (!getStreamer().getCurrentFragment()) + return Error(getTok().getLoc(), "expected section directive"); + StringRef Label; if (getParser().parseIdentifier(Label)) return Error(Loc, "expected identifier for procedure"); diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 78261c1f9fedb..b2c956e0a4598 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -1454,7 +1454,8 @@ bool MasmParser::Run(bool NoInitialTextSection, bool NoFinalize) { } bool MasmParser::checkForValidSection() { - if (!ParsingMSInlineAsm && !getStreamer().getCurrentSectionOnly()) { + if (!ParsingMSInlineAsm && !(getStreamer().getCurrentFragment() && + getStreamer().getCurrentSectionOnly())) { Out.initSections(false, getTargetParser().getSTI()); return Error(getTok().getLoc(), "expected section directive before assembly directive"); diff --git a/llvm/test/tools/llvm-ml/bare_proc_error.asm b/llvm/test/tools/llvm-ml/bare_proc_error.asm new file mode 100644 index 0000000000000..59668edafccf1 --- /dev/null +++ b/llvm/test/tools/llvm-ml/bare_proc_error.asm @@ -0,0 +1,7 @@ +; RUN: not llvm-ml -filetype=s %s /Fo /dev/null 2>&1 | FileCheck %s + +; CHECK: :[[# @LINE+1]]:1: error: expected section directive +foo PROC +; CHECK: :[[# @LINE+1]]:6: error: expected section directive before assembly directive + ret +foo ENDP diff --git a/llvm/test/tools/llvm-ml/no_section_error.asm b/llvm/test/tools/llvm-ml/no_section_error.asm new file mode 100644 index 0000000000000..65c111908b81a --- /dev/null +++ b/llvm/test/tools/llvm-ml/no_section_error.asm @@ -0,0 +1,4 @@ +; RUN: not llvm-ml -filetype=s %s /Fo /dev/null 2>&1 | FileCheck %s + +; CHECK: :[[# @LINE + 1]]:6: error: expected section directive before assembly directive in 'BYTE' directive +BYTE 2, 3, 4 From ec66c4af09263e68d800971906e60afc27d54a06 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 24 Jan 2025 10:44:00 -0500 Subject: [PATCH 016/432] [AMDGPU][True16][CodeGen] true16 codegen pattern for f16 canonicalize (#122000) true16 codegen pattern for f16 canonicalize --- llvm/lib/Target/AMDGPU/SIInstructions.td | 33 +- .../GlobalISel/inst-select-fcanonicalize.mir | 43 ++- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 363 ++++++++++++------ 3 files changed, 304 insertions(+), 135 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index da44faac2f910..15c77c2a723e4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3093,7 +3093,7 @@ foreach vt = [f16, v2f16, f32, v2f32, f64] in { // Prefer selecting to max when legal, but using mul is always valid. 
let AddedComplexity = -5 in { -let OtherPredicates = [NotHasTrue16BitInsts] in { +let True16Predicate = NotHasTrue16BitInsts in { def : GCNPat< (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) @@ -3103,9 +3103,21 @@ def : GCNPat< (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) >; -} // End OtherPredicates +} // End True16Predicate -let OtherPredicates = [HasTrue16BitInsts] in { +let True16Predicate = UseRealTrue16Insts in { +def : GCNPat< + (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), + (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0) +>; + +def : GCNPat< + (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), + (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_NEG_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0) +>; +} // End True16Predicate + +let True16Predicate = UseFakeTrue16Insts in { def : GCNPat< (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) @@ -3115,7 +3127,7 @@ def : GCNPat< (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) >; -} // End OtherPredicates +} // End True16Predicate def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), @@ -3173,13 +3185,22 @@ multiclass SelectCanonicalizeAsMax< def : GCNPat< (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { - let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, NotHasTrue16BitInsts]); + let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts]); + let True16Predicate = NotHasTrue16BitInsts; + } + + def : GCNPat< + (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), + (V_MAX_F16_t16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { + let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts]); + let True16Predicate = UseRealTrue16Insts; } def : GCNPat< (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), (V_MAX_F16_fake16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { - let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, HasTrue16BitInsts]); + let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts]); + let True16Predicate = UseFakeTrue16Insts; } def : GCNPat< diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir index c07a2b0b85921..d32634806f7bd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir @@ -2,7 +2,8 @@ # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +# 
RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -mattr=-real-true16 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- @@ -38,12 +39,20 @@ body: | ; GFX10-NEXT: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]] ; - ; GFX11-LABEL: name: fcanonicalize_f16_denorm - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_MAX_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_fake16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcanonicalize_f16_denorm + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY1]], 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcanonicalize_f16_denorm + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_MAX_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_fake16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FCANONICALIZE %1 @@ -84,12 +93,20 @@ body: | ; GFX10-NEXT: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]] ; - ; GFX11-LABEL: name: fcanonicalize_f16_flush - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_MAX_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_fake16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcanonicalize_f16_flush + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY1]], 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcanonicalize_f16_flush + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_MAX_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_fake16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FCANONICALIZE %1 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 3c70883f09d2c..b4dbe0e7be924 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s declare half @llvm.fabs.f16(half) #0 declare half @llvm.canonicalize.f16(half) #0 @@ -96,16 +97,27 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_canonicalize_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: global_store_b16 v[0:1], v0, off -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_canonicalize_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-FAKE16-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %canonicalized = call half @llvm.canonicalize.f16(half %val) store half %canonicalized, ptr addrspace(1) undef @@ -147,16 +159,29 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: s_test_canonicalize_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, s2, s2 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: s_test_canonicalize_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: 
global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_test_canonicalize_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v1, s2, s2 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %val = bitcast i16 %val.arg to half %canonicalized = call half @llvm.canonicalize.f16(half %val) store half %canonicalized, ptr addrspace(1) %out @@ -239,16 +264,27 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_canonicalize_fabs_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_canonicalize_fabs_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, |v0.l|, |v0.l| +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_fabs_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fabs = call half @llvm.fabs.f16(half %val) %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs) @@ -293,16 +329,27 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_canonicalize_fneg_fabs_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -|v0.l|, -|v0.l| +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_fneg_fabs_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
global_load_u16 v1, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fabs = call half @llvm.fabs.f16(half %val) %val.fabs.fneg = fneg half %val.fabs @@ -348,16 +395,27 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_canonicalize_fneg_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_canonicalize_fneg_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_fneg_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fneg = fneg half %val %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) @@ -402,16 +460,27 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fneg = fneg half %val %canonicalized = call half @llvm.canonicalize.f16(half 
%val.fneg) @@ -456,16 +525,27 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -|v0.l|, -|v0.l| +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fabs = call half @llvm.fabs.f16(half %val) %val.fabs.fneg = fneg half %val.fabs @@ -2325,13 +2405,21 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 { ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <2 x half> undef, half %val, i32 0 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) ret <2 x half> %canonicalized @@ -2358,13 +2446,21 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { ; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 ; CI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_test_canonicalize_undef_reg_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_test_canonicalize_undef_reg_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_undef_reg_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <2 x half> undef, half %val, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) ret <2 x half> %canonicalized @@ -2513,13 +2609,21 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 { ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_test_canonicalize_reg_k_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, 2.0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_k_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 2.0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_k_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 2.0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %vec0 = insertelement <2 x half> undef, half %val, i32 0 %vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1) @@ -2549,13 +2653,21 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 { ; CI-NEXT: v_mov_b32_e32 v0, 2.0 ; CI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_test_canonicalize_k_reg_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, 2.0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_test_canonicalize_k_reg_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, 2.0, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_k_reg_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, 2.0, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %vec0 = insertelement <2 x half> undef, half 2.0, i32 0 %vec1 = insertelement <2 x half> %vec0, half %val, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1) @@ -2635,14 +2747,23 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] 
; -; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <4 x half> undef, half %val, i32 0 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec) ret <4 x half> %canonicalized @@ -2725,15 +2846,25 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; CI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %vec0 = insertelement <4 x half> undef, half %val0, i32 0 %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2 %vec2 = insertelement <4 x half> %vec1, half %val2, i32 3 From 5d2393a222c751723b0906485bf90a28dd4e564b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 24 Jan 2025 08:09:20 -0800 Subject: [PATCH 017/432] [InstCombine] Avoid repeated hash lookups (NFC) (#124243) --- .../Transforms/InstCombine/InstCombineVectorOps.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 
a9ae09b8dba43..6860a7cd07b78 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -3060,14 +3060,10 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth; assert(SrcElemsPerTgtElem); BegIdx /= SrcElemsPerTgtElem; - bool BCAlreadyExists = NewBCs.contains(CastSrcTy); - auto *NewBC = - BCAlreadyExists - ? NewBCs[CastSrcTy] - : Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc"); - if (!BCAlreadyExists) - NewBCs[CastSrcTy] = NewBC; - auto *Ext = Builder.CreateExtractElement(NewBC, BegIdx, + auto [It, Inserted] = NewBCs.try_emplace(CastSrcTy); + if (Inserted) + It->second = Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc"); + auto *Ext = Builder.CreateExtractElement(It->second, BegIdx, SVI.getName() + ".extract"); // The shufflevector isn't being replaced: the bitcast that used it // is. InstCombine will visit the newly-created instructions. From 37bf0a10fb4cee10f4acbb7da453e7c19c8ee599 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 24 Jan 2025 17:10:10 +0100 Subject: [PATCH 018/432] [SCEV] Add test for #123550 (NFC) --- .../test/Analysis/ScalarEvolution/pr123550.ll | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 llvm/test/Analysis/ScalarEvolution/pr123550.ll diff --git a/llvm/test/Analysis/ScalarEvolution/pr123550.ll b/llvm/test/Analysis/ScalarEvolution/pr123550.ll new file mode 100644 index 0000000000000..c1f2051248a12 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/pr123550.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -disable-output -passes='print' < %s 2>&1 | FileCheck %s + +; FIXME: This is a miscompile. 
+define i32 @test() { +; CHECK-LABEL: 'test' +; CHECK-NEXT: Classifying expressions for: @test +; CHECK-NEXT: %phi = phi i32 [ -173, %bb ], [ %sub, %loop ] +; CHECK-NEXT: --> (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32))) U: empty-set S: empty-set Exits: -173 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv2 = phi i32 [ 1, %bb ], [ %iv2.inc, %loop ] +; CHECK-NEXT: --> {1,+,1}<%loop> U: [1,2) S: [1,2) Exits: 1 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %srem = srem i32 729259140, %phi +; CHECK-NEXT: --> (729259140 + (-1 * (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32))) * (729259140 /u (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32)))))) U: empty-set S: empty-set Exits: 729259140 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %trunc = trunc i32 %iv2 to i8 +; CHECK-NEXT: --> {1,+,1}<%loop> U: [1,2) S: [1,2) Exits: 1 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %urem = urem i8 -83, %trunc +; CHECK-NEXT: --> (-83 + ((-83 /u {1,+,1}<%loop>) * {-1,+,-1}<%loop>)) U: [0,1) S: [0,1) Exits: 0 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %zext = zext i8 %urem to i32 +; CHECK-NEXT: --> (zext i8 (-83 + ((-83 /u {1,+,1}<%loop>) * {-1,+,-1}<%loop>)) to i32) U: [0,1) S: [0,1) Exits: 0 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %sub = sub i32 0, %zext +; CHECK-NEXT: --> (-1 * (zext i8 (-83 + ((-83 /u {1,+,1}<%loop>) * {-1,+,-1}<%loop>)) to i32)) U: [0,1) S: [0,1) Exits: 0 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv2.inc = add i32 %iv2, 1 +; CHECK-NEXT: --> {2,+,1}<%loop> U: [2,3) S: [2,3) Exits: 2 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %srem.lcssa = phi i32 [ %srem, %loop ] +; CHECK-NEXT: --> (729259140 + (-1 * (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32))) * (729259140 /u (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32)))))) U: empty-set S: empty-set --> 729259140 U: [729259140,729259141) S: [729259140,729259141) +; CHECK-NEXT: Determining loop execution counts for: @test +; CHECK-NEXT: Loop %loop: backedge-taken count is i32 0 +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 0 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is i32 0 +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +bb: + br label %loop + +loop: + %phi = phi i32 [ -173, %bb ], [ %sub, %loop ] + %iv2 = phi i32 [ 1, %bb ], [ %iv2.inc, %loop ] + %srem = srem i32 729259140, %phi + %trunc = trunc i32 %iv2 to i8 + %urem = urem i8 -83, %trunc + %zext = zext i8 %urem to i32 + %sub = sub i32 0, %zext + %iv2.inc = add i32 %iv2, 1 + %icmp = icmp eq i32 %zext, 0 + br i1 %icmp, label %exit, label %loop + +exit: + %srem.lcssa = phi i32 [ %srem, %loop ] + ret i32 %srem.lcssa +} From 256f40d0e6b2beb0e951b0f5f836847223c5695c Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 24 Jan 2025 10:17:20 -0600 Subject: [PATCH 019/432] [libc] Use the NVIDIA device allocator for GPU malloc (#124277) Summary: This is a blocker on another patch in the OpenMP runtime. The problem is that NVIDIA truly doesn't handle RPC-based allocations very well. It cannot reliably update the MMU while a kernel is running and it will usually deadlock if called from a separate thread due to internal use of TLS. This patch just removes the definition of `malloc` and `free` for NVPTX. The result here is that they will be undefined, which is the cue for the `nvlink` linker to define them for us. 
So, as far as `libc` is concerned it still implements malloc. --- libc/src/stdlib/gpu/free.cpp | 4 ++++ libc/src/stdlib/gpu/malloc.cpp | 4 ++++ libc/test/src/stdlib/CMakeLists.txt | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/libc/src/stdlib/gpu/free.cpp b/libc/src/stdlib/gpu/free.cpp index 1f0e9ec735974..6ef9d718315a5 100644 --- a/libc/src/stdlib/gpu/free.cpp +++ b/libc/src/stdlib/gpu/free.cpp @@ -14,6 +14,10 @@ namespace LIBC_NAMESPACE_DECL { +// FIXME: For now we just default to the NVIDIA device allocator which is +// always available on NVPTX targets. This will be implemented fully later. +#ifndef LIBC_TARGET_ARCH_IS_NVPTX LLVM_LIBC_FUNCTION(void, free, (void *ptr)) { gpu::deallocate(ptr); } +#endif } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/gpu/malloc.cpp b/libc/src/stdlib/gpu/malloc.cpp index 54f2d8843996e..b5909cb9cb4d0 100644 --- a/libc/src/stdlib/gpu/malloc.cpp +++ b/libc/src/stdlib/gpu/malloc.cpp @@ -14,8 +14,12 @@ namespace LIBC_NAMESPACE_DECL { +// FIXME: For now we just default to the NVIDIA device allocator which is +// always available on NVPTX targets. This will be implemented fully later. +#ifndef LIBC_TARGET_ARCH_IS_NVPTX LLVM_LIBC_FUNCTION(void *, malloc, (size_t size)) { return gpu::allocate(size); } +#endif } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 8cc0428632ba3..aba76833be9d4 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -420,7 +420,8 @@ if(LLVM_LIBC_FULL_BUILD) ) # Only baremetal and GPU has an in-tree 'malloc' implementation. - if(LIBC_TARGET_OS_IS_BAREMETAL OR LIBC_TARGET_OS_IS_GPU) + if((LIBC_TARGET_OS_IS_BAREMETAL OR LIBC_TARGET_OS_IS_GPU) AND + NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_libc_test( malloc_test HERMETIC_TEST_ONLY From 7842374103b26933d71a8fe354cd4d8715d55b1c Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 24 Jan 2025 08:20:35 -0800 Subject: [PATCH 020/432] [NFC][TableGen] Emit nested namespaces in InstrInfoEmitter (#124210) - Emit C++17 nested namespaces in InstrInfoEmitter. --- llvm/utils/TableGen/InstrInfoEmitter.cpp | 89 +++++++----------------- 1 file changed, 27 insertions(+), 62 deletions(-) diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 7811734d5fdac..12401a2f246a1 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -283,7 +283,6 @@ void InstrInfoEmitter::emitOperandNameMappings( raw_ostream &OS, const CodeGenTarget &Target, ArrayRef NumberedInstructions) { StringRef Namespace = Target.getInstNamespace(); - std::string OpNameNS = "OpName"; // Map of operand names to their enumeration value. This will be used to // generate the OpName enum. 
std::map Operands; @@ -293,24 +292,19 @@ void InstrInfoEmitter::emitOperandNameMappings( OS << "#ifdef GET_INSTRINFO_OPERAND_ENUM\n"; OS << "#undef GET_INSTRINFO_OPERAND_ENUM\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; - OS << "namespace " << OpNameNS << " {\n"; + OS << "namespace llvm::" << Namespace << "::OpName {\n"; OS << "enum {\n"; for (const auto &Op : Operands) OS << " " << Op.first << " = " << Op.second << ",\n"; OS << " OPERAND_LAST"; OS << "\n};\n"; - OS << "} // end namespace OpName\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "::OpName\n"; OS << "#endif //GET_INSTRINFO_OPERAND_ENUM\n\n"; OS << "#ifdef GET_INSTRINFO_NAMED_OPS\n"; OS << "#undef GET_INSTRINFO_NAMED_OPS\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; OS << "LLVM_READONLY\n"; OS << "int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx) {\n"; if (!Operands.empty()) { @@ -343,8 +337,7 @@ void InstrInfoEmitter::emitOperandNameMappings( OS << " return -1;\n"; } OS << "}\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif //GET_INSTRINFO_NAMED_OPS\n\n"; } @@ -365,9 +358,7 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << "#ifdef GET_INSTRINFO_OPERAND_TYPES_ENUM\n"; OS << "#undef GET_INSTRINFO_OPERAND_TYPES_ENUM\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; - OS << "namespace OpTypes {\n"; + OS << "namespace llvm::" << Namespace << "::OpTypes {\n"; OS << "enum OperandType {\n"; unsigned EnumVal = 0; @@ -382,15 +373,12 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << " OPERAND_TYPE_LIST_END" << "\n};\n"; - OS << "} // end namespace OpTypes\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "::OpTypes\n"; OS << "#endif // GET_INSTRINFO_OPERAND_TYPES_ENUM\n\n"; OS << "#ifdef GET_INSTRINFO_OPERAND_TYPE\n"; OS << "#undef GET_INSTRINFO_OPERAND_TYPE\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; OS << "LLVM_READONLY\n"; OS << "static int getOperandType(uint16_t Opcode, uint16_t OpIdx) {\n"; auto getInstrName = [&](int I) -> StringRef { @@ -465,14 +453,12 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << " llvm_unreachable(\"No instructions defined\");\n"; } OS << "}\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif // GET_INSTRINFO_OPERAND_TYPE\n\n"; OS << "#ifdef GET_INSTRINFO_MEM_OPERAND_SIZE\n"; OS << "#undef GET_INSTRINFO_MEM_OPERAND_SIZE\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; OS << "LLVM_READONLY\n"; OS << "static int getMemOperandSize(int OpType) {\n"; OS << " switch (OpType) {\n"; @@ -490,8 +476,7 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << " return " << KV.first << ";\n\n"; } OS << " }\n}\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif // GET_INSTRINFO_MEM_OPERAND_SIZE\n\n"; } @@ -526,8 +511,7 @@ void 
InstrInfoEmitter::emitLogicalOperandSizeMappings( OS << "#ifdef GET_INSTRINFO_LOGICAL_OPERAND_SIZE_MAP\n"; OS << "#undef GET_INSTRINFO_LOGICAL_OPERAND_SIZE_MAP\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; OS << "LLVM_READONLY static unsigned\n"; OS << "getLogicalOperandSize(uint16_t Opcode, uint16_t LogicalOpIdx) {\n"; if (!InstMap.empty()) { @@ -577,8 +561,7 @@ void InstrInfoEmitter::emitLogicalOperandSizeMappings( OS << " return S;\n"; OS << "}\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif // GET_INSTRINFO_LOGICAL_OPERAND_SIZE_MAP\n\n"; } @@ -619,8 +602,7 @@ void InstrInfoEmitter::emitLogicalOperandTypeMappings( OS << "#ifdef GET_INSTRINFO_LOGICAL_OPERAND_TYPE_MAP\n"; OS << "#undef GET_INSTRINFO_LOGICAL_OPERAND_TYPE_MAP\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; OS << "LLVM_READONLY static int\n"; OS << "getLogicalOperandType(uint16_t Opcode, uint16_t LogicalOpIdx) {\n"; if (!InstMap.empty()) { @@ -666,8 +648,7 @@ void InstrInfoEmitter::emitLogicalOperandTypeMappings( OS << " return -1;\n"; } OS << "}\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif // GET_INSTRINFO_LOGICAL_OPERAND_TYPE_MAP\n\n"; } @@ -701,8 +682,7 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS, OS << "#ifdef GET_INSTRINFO_MC_HELPERS\n"; OS << "#undef GET_INSTRINFO_MC_HELPERS\n\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << TargetName << "_MC {\n\n"; + OS << "namespace llvm::" << TargetName << "_MC {\n"; PredicateExpander PE(TargetName); PE.setExpandForMC(true); @@ -716,8 +696,7 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS, OS << "\n}\n\n"; } - OS << "} // end namespace " << TargetName << "_MC\n"; - OS << "} // end namespace llvm\n\n"; + OS << "} // end namespace llvm::" << TargetName << "_MC\n"; OS << "#endif // GET_GENISTRINFO_MC_HELPERS\n\n"; } @@ -743,8 +722,7 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, << "#endif\n"; OS << "#ifdef GET_COMPUTE_FEATURES\n" << "#undef GET_COMPUTE_FEATURES\n" - << "namespace llvm {\n" - << "namespace " << Target.getName() << "_MC {\n\n"; + << "namespace llvm::" << Target.getName() << "_MC {\n"; // Emit the subtarget feature enumeration. 
SubtargetFeatureInfo::emitSubtargetFeatureBitEnumeration(SubtargetFeatures, @@ -827,14 +805,12 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, << " return FeatureBitsets[RequiredFeaturesRefs[Opcode]];\n" << "}\n\n"; - OS << "} // end namespace " << Target.getName() << "_MC\n" - << "} // end namespace llvm\n" + OS << "} // end namespace llvm::" << Target.getName() << "_MC\n" << "#endif // GET_COMPUTE_FEATURES\n\n"; OS << "#ifdef GET_AVAILABLE_OPCODE_CHECKER\n" << "#undef GET_AVAILABLE_OPCODE_CHECKER\n" - << "namespace llvm {\n" - << "namespace " << Target.getName() << "_MC {\n"; + << "namespace llvm::" << Target.getName() << "_MC {\n"; OS << "bool isOpcodeAvailable(" << "unsigned Opcode, const FeatureBitset &Features) {\n" << " FeatureBitset AvailableFeatures = " @@ -846,16 +822,14 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, << " RequiredFeatures;\n" << " return !MissingFeatures.any();\n" << "}\n"; - OS << "} // end namespace " << Target.getName() << "_MC\n" - << "} // end namespace llvm\n" + OS << "} // end namespace llvm::" << Target.getName() << "_MC\n" << "#endif // GET_AVAILABLE_OPCODE_CHECKER\n\n"; OS << "#ifdef ENABLE_INSTR_PREDICATE_VERIFIER\n" << "#undef ENABLE_INSTR_PREDICATE_VERIFIER\n" << "#include \n\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Target.getName() << "_MC {\n\n"; + OS << "namespace llvm::" << Target.getName() << "_MC {\n"; // Emit the name table for error messages. OS << "#ifndef NDEBUG\n"; @@ -886,8 +860,7 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, << " }\n" << "#endif // NDEBUG\n"; OS << "}\n"; - OS << "} // end namespace " << Target.getName() << "_MC\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Target.getName() << "_MC\n"; OS << "#endif // ENABLE_INSTR_PREDICATE_VERIFIER\n\n"; } @@ -1318,17 +1291,14 @@ void InstrInfoEmitter::emitEnums(raw_ostream &OS) { OS << "#ifdef GET_INSTRINFO_ENUM\n"; OS << "#undef GET_INSTRINFO_ENUM\n"; - OS << "namespace llvm {\n\n"; - const CodeGenTarget &Target = CDP.getTargetInfo(); - - // We must emit the PHI opcode first... 
StringRef Namespace = Target.getInstNamespace(); if (Namespace.empty()) PrintFatalError("No instructions defined!"); - OS << "namespace " << Namespace << " {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; + OS << " enum {\n"; unsigned Num = 0; for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) @@ -1336,24 +1306,19 @@ void InstrInfoEmitter::emitEnums(raw_ostream &OS) { << "\t= " << (Num = Target.getInstrIntValue(Inst->TheDef)) << ",\n"; OS << " INSTRUCTION_LIST_END = " << Num + 1 << "\n"; OS << " };\n\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif // GET_INSTRINFO_ENUM\n\n"; OS << "#ifdef GET_INSTRINFO_SCHED_ENUM\n"; OS << "#undef GET_INSTRINFO_SCHED_ENUM\n"; - OS << "namespace llvm {\n\n"; - OS << "namespace " << Namespace << " {\n"; - OS << "namespace Sched {\n"; + OS << "namespace llvm::" << Namespace << "::Sched {\n\n"; OS << " enum {\n"; Num = 0; for (const auto &Class : SchedModels.explicit_classes()) OS << " " << Class.Name << "\t= " << Num++ << ",\n"; OS << " SCHED_LIST_END = " << Num << "\n"; OS << " };\n"; - OS << "} // end namespace Sched\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "::Sched\n"; OS << "#endif // GET_INSTRINFO_SCHED_ENUM\n\n"; } From cccb55491223cd410cb2f83973377dd75757cb60 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Fri, 24 Jan 2025 11:33:05 -0500 Subject: [PATCH 021/432] [lldb] Remove unused posix_openpt function definition for Android (#124257) This was for the wrapper function that was in source/Host/android/LibcGlue.cpp. Android added support 10+ years ago. --- lldb/source/Host/common/PseudoTerminal.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lldb/source/Host/common/PseudoTerminal.cpp b/lldb/source/Host/common/PseudoTerminal.cpp index d53327973eb27..53e91aff212a4 100644 --- a/lldb/source/Host/common/PseudoTerminal.cpp +++ b/lldb/source/Host/common/PseudoTerminal.cpp @@ -27,10 +27,6 @@ #include #endif -#if defined(__ANDROID__) -int posix_openpt(int flags); -#endif - using namespace lldb_private; // PseudoTerminal constructor From 3da7de34a2bcfeef73747a9796652f6bff225de3 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Fri, 24 Jan 2025 08:49:35 -0800 Subject: [PATCH 022/432] [flang][runtime] Disable optimization for traceback related functions. (#124172) The backtrace may at least print the backtrace name in the call stack, but this does not happen with the release builds of the runtime. Surprisingly, specifying "no-omit-frame-pointer" did not work with GCC, so I decided to fall back to -O0 for these functions. --- flang/include/flang/Common/api-attrs.h | 11 +++++++++++ flang/runtime/stop.cpp | 14 +++++++++----- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/flang/include/flang/Common/api-attrs.h b/flang/include/flang/Common/api-attrs.h index d73e60996bc81..1ee91ca8e0d9d 100644 --- a/flang/include/flang/Common/api-attrs.h +++ b/flang/include/flang/Common/api-attrs.h @@ -178,4 +178,15 @@ #define RT_DEVICE_NOINLINE_HOST_INLINE inline #endif +/* RT_OPTNONE_ATTR allows disabling optimizations per function. */ +#if __has_attribute(optimize) +/* GCC style. */ +#define RT_OPTNONE_ATTR __attribute__((optimize("O0"))) +#elif __has_attribute(optnone) +/* Clang style. 
*/ +#define RT_OPTNONE_ATTR __attribute__((optnone)) +#else +#define RT_OPTNONE_ATTR +#endif + #endif /* !FORTRAN_RUNTIME_API_ATTRS_H_ */ diff --git a/flang/runtime/stop.cpp b/flang/runtime/stop.cpp index a7be8a082e026..f8c180e0aaffa 100644 --- a/flang/runtime/stop.cpp +++ b/flang/runtime/stop.cpp @@ -157,7 +157,7 @@ void RTNAME(PauseStatementText)(const char *code, std::size_t length) { std::exit(status); } -static void PrintBacktrace() { +static RT_NOINLINE_ATTR void PrintBacktrace() { #ifdef HAVE_BACKTRACE // TODO: Need to parse DWARF information to print function line numbers constexpr int MAX_CALL_STACK{999}; @@ -165,8 +165,12 @@ static void PrintBacktrace() { int nptrs{(int)backtrace(buffer, MAX_CALL_STACK)}; if (char **symbols{backtrace_symbols(buffer, nptrs)}) { - for (int i = 0; i < nptrs; i++) { - Fortran::runtime::Terminator{}.PrintCrashArgs("#%d %s\n", i, symbols[i]); + // Skip the PrintBacktrace() frame, as it is just a utility. + // It makes sense to start printing the backtrace + // from Abort() or backtrace(). + for (int i = 1; i < nptrs; i++) { + Fortran::runtime::Terminator{}.PrintCrashArgs( + "#%d %s\n", i - 1, symbols[i]); } free(symbols); } @@ -179,14 +183,14 @@ static void PrintBacktrace() { #endif } -[[noreturn]] void RTNAME(Abort)() { +[[noreturn]] RT_OPTNONE_ATTR void RTNAME(Abort)() { #ifdef HAVE_BACKTRACE PrintBacktrace(); #endif std::abort(); } -void FORTRAN_PROCEDURE_NAME(backtrace)() { PrintBacktrace(); } +RT_OPTNONE_ATTR void FORTRAN_PROCEDURE_NAME(backtrace)() { PrintBacktrace(); } [[noreturn]] void RTNAME(ReportFatalUserError)( const char *message, const char *source, int line) { From a976036a100b7dd459b6cabac96159875fcd513d Mon Sep 17 00:00:00 2001 From: lntue Date: Fri, 24 Jan 2025 11:57:43 -0500 Subject: [PATCH 023/432] [libc][NFC] Remove extra ; in exhaustive_test.h. (#124216) These cause warnings when running check-libc. --- libc/test/src/math/exhaustive/exhaustive_test.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/test/src/math/exhaustive/exhaustive_test.h b/libc/test/src/math/exhaustive/exhaustive_test.h index 94489d2e55daa..5912f7a27dc52 100644 --- a/libc/test/src/math/exhaustive/exhaustive_test.h +++ b/libc/test/src/math/exhaustive/exhaustive_test.h @@ -225,7 +225,7 @@ struct LlvmLibcExhaustiveMathTest std::cout << "-- Testing for FE_TOWARDZERO in range [0x" << std::hex << start << ", 0x" << stop << ") --" << std::dec << std::endl; test_full_range(mpfr::RoundingMode::TowardZero, start, stop); - }; + } void test_full_range_all_roundings(StorageType x_start, StorageType x_stop, StorageType y_start, StorageType y_stop) { @@ -252,7 +252,7 @@ struct LlvmLibcExhaustiveMathTest << ", 0x" << y_stop << ") --" << std::dec << std::endl; test_full_range(mpfr::RoundingMode::TowardZero, x_start, x_stop, y_start, y_stop); - }; + } }; template Func> From ba6774f997ee28157b0a3b8816cc76b94ed1da17 Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Fri, 24 Jan 2025 18:09:48 +0100 Subject: [PATCH 024/432] [mlir][xegpu] Fix verifier diagnostic recursion (#124148) Uses global diagnostic message in operation verifier to avoid infinite recursion on a warning. Emitting diagnostics through the operation under verification creates a loop where verifier runs again before printing the message. 
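To make the loop concrete, here is a minimal sketch of the two emission
styles (names are taken from the diff below; the recursion assumes the
default diagnostic setup, which appends the offending op to the message
and re-runs the verifier when printing an op in custom form):

    // Problematic: the diagnostic is attached to the op. Rendering the
    // message prints the op, printing re-verifies it, and verify()
    // reaches this statement again before anything is flushed.
    emitWarning("Invalid transpose attr. It is ignored.");

    // Safe: attach the diagnostic to the location only, so no op is
    // printed and no further verification is triggered.
    mlir::emitWarning(getLoc()) << "Invalid transpose attr. It is ignored.";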
--- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 15c435f1fa257..81f46f941785a 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -294,7 +294,7 @@ LogicalResult LoadNdOp::verify() { if (valid) transpose(trans, tdescShape); else - emitWarning("Invalid transpose attr. It is ignored."); + mlir::emitWarning(getLoc()) << "Invalid transpose attr. It is ignored."; } if (getPacked()) { @@ -304,8 +304,9 @@ LogicalResult LoadNdOp::verify() { tdescShape[axis] /= vnni_factor; tdescShape.push_back(vnni_factor); } else { - emitWarning("Invalid Packed Attr. It is ignored (available for 2D " - "TensorDesc only)."); + mlir::emitWarning(getLoc()) + << "Invalid Packed Attr. It is ignored (available for 2D " + "TensorDesc only)."; } } From d88293d8a2005b19f89a86252c60102cec6c9b01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Fri, 24 Jan 2025 17:15:06 +0000 Subject: [PATCH 025/432] [mlir][vector] Disable `BreakDownVectorBitCast` for scalable vectors (#122725) `BreakDownVectorBitCast` leverages * `vector.extract_strided_slices` + `vector.insert_strided_slices` As these Ops do not support extracting scalable sub-vectors (i.e. extracting/inserting a fraction of a scalable dim), it's best to bail out. --- .../Dialect/Vector/Transforms/VectorTransforms.cpp | 7 +++++++ .../Dialect/Vector/vector-break-down-bitcast.mlir | 11 +++++++++++ 2 files changed, 18 insertions(+) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index d9be8d0e578ae..275f11160487a 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -906,6 +906,13 @@ struct BreakDownVectorBitCast : public OpRewritePattern { VectorType castDstType = bitcastOp.getResultVectorType(); assert(castSrcType.getRank() == castDstType.getRank()); + // This transformation builds on top of + // vector.{extract|insert}_strided_slice, which do not support + // extracting/inserting "scallable sub-vectors". Bail out. + if (castSrcType.isScalable()) + return rewriter.notifyMatchFailure(bitcastOp, + "Scalable vectors are not supported"); + // Only support rank 1 case for now. if (castSrcType.getRank() != 1) return failure(); diff --git a/mlir/test/Dialect/Vector/vector-break-down-bitcast.mlir b/mlir/test/Dialect/Vector/vector-break-down-bitcast.mlir index fbb2f7605e649..173388f63ecda 100644 --- a/mlir/test/Dialect/Vector/vector-break-down-bitcast.mlir +++ b/mlir/test/Dialect/Vector/vector-break-down-bitcast.mlir @@ -39,3 +39,14 @@ func.func @bitcast_i8_to_i32(%input: vector<16xi8>) -> vector<4xi32> { // CHECK: %[[CAST3:.+]] = vector.bitcast %[[EXTRACT3]] : vector<4xi8> to vector<1xi32> // CHECK: %[[INSERT3:.+]] = vector.insert_strided_slice %[[CAST3]], %[[INSERT2]] {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> // CHECK: return %[[INSERT3]] + +// ----- + +// Scalable vectors are not supported! 
+ +// CHECK-LABEL: func.func @bitcast_scalable_negative +// CHECK: vector.bitcast +func.func @bitcast_scalable_negative(%input: vector<[8]xf16>) -> vector<[4]xf32> { + %0 = vector.bitcast %input : vector<[8]xf16> to vector<[4]xf32> + return %0: vector<[4]xf32> +} From 474f5d2aefb44430b89ed72774a3c1d26a0adfb1 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Fri, 24 Jan 2025 17:22:27 +0000 Subject: [PATCH 026/432] [FMV][AArch64] Remove features predres and ls64. (#124266) These cannot be detected by reading the ID_AA64ISAR1_EL1 register since their corresponding bitfields are hidden. Additionally the instructions that these features enable are unusable from EL0. ACLE: https://github.com/ARM-software/acle/pull/382 --- .../CodeGen/AArch64/cpu-supports-target.c | 6 +-- clang/test/CodeGen/AArch64/cpu-supports.c | 6 +-- clang/test/CodeGen/AArch64/fmv-dependencies.c | 8 ---- clang/test/CodeGen/AArch64/fmv-features.c | 8 ---- clang/test/CodeGen/AArch64/fmv-priority.c | 10 ++--- .../test/CodeGen/attr-target-clones-aarch64.c | 18 ++++----- clang/test/CodeGen/attr-target-version.c | 38 +++++++++---------- .../CodeGenCXX/attr-target-clones-aarch64.cpp | 20 +++++----- clang/test/Sema/attr-target-version.c | 2 +- .../builtins/cpu_model/AArch64CPUFeatures.inc | 10 ++--- .../builtins/cpu_model/aarch64/fmv/apple.inc | 2 - .../builtins/cpu_model/aarch64/fmv/mrs.inc | 11 ------ .../llvm/TargetParser/AArch64CPUFeatures.inc | 10 ++--- llvm/lib/Target/AArch64/AArch64FMV.td | 2 - 14 files changed, 60 insertions(+), 91 deletions(-) diff --git a/clang/test/CodeGen/AArch64/cpu-supports-target.c b/clang/test/CodeGen/AArch64/cpu-supports-target.c index 6223db7c09253..a39ffd4e4a74d 100644 --- a/clang/test/CodeGen/AArch64/cpu-supports-target.c +++ b/clang/test/CodeGen/AArch64/cpu-supports-target.c @@ -91,8 +91,8 @@ // CHECK-NEXT: br label %[[RETURN]] // CHECK: [[IF_ELSE16]]: // CHECK-NEXT: [[TMP36:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], 10836786603360256 -// CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[TMP37]], 10836786603360256 +// CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], 1688849860263936 +// CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[TMP37]], 1688849860263936 // CHECK-NEXT: [[TMP39:%.*]] = and i1 true, [[TMP38]] // CHECK-NEXT: br i1 [[TMP39]], label %[[IF_THEN17:.*]], label %[[IF_ELSE18:.*]] // CHECK: [[IF_THEN17]]: @@ -142,7 +142,7 @@ int check_all_features() { return 8; else if (__builtin_cpu_supports("sme+memtag+sb")) return 9; - else if (__builtin_cpu_supports("predres+ssbs+bti+ls64")) + else if (__builtin_cpu_supports("ssbs+bti")) return 10; else if (__builtin_cpu_supports("wfxt+sme-f64f64")) return 11; diff --git a/clang/test/CodeGen/AArch64/cpu-supports.c b/clang/test/CodeGen/AArch64/cpu-supports.c index 406201781d480..5691901bcd98f 100644 --- a/clang/test/CodeGen/AArch64/cpu-supports.c +++ b/clang/test/CodeGen/AArch64/cpu-supports.c @@ -27,8 +27,8 @@ // CHECK-NEXT: br label [[RETURN]] // CHECK: if.end2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 171141184020873984 -// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 171141184020873984 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 162133984766132992 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 162133984766132992 // CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] // CHECK-NEXT: br i1 [[TMP11]], label [[IF_THEN3:%.*]], label [[IF_END4:%.*]] // CHECK: if.then3: @@ -53,7 +53,7 @@ int 
main(void) { if (__builtin_cpu_supports("sve2-aes+memtag")) return 2; - if (__builtin_cpu_supports("sme2+ls64+wfxt")) + if (__builtin_cpu_supports("sme2+wfxt")) return 3; if (__builtin_cpu_supports("avx2")) diff --git a/clang/test/CodeGen/AArch64/fmv-dependencies.c b/clang/test/CodeGen/AArch64/fmv-dependencies.c index 8dda3b647fcd0..7cfab7de41a9d 100644 --- a/clang/test/CodeGen/AArch64/fmv-dependencies.c +++ b/clang/test/CodeGen/AArch64/fmv-dependencies.c @@ -60,9 +60,6 @@ __attribute__((target_version("i8mm"))) int fmv(void) { return 0; } // CHECK: define dso_local i32 @fmv._Mjscvt() #[[jscvt:[0-9]+]] { __attribute__((target_version("jscvt"))) int fmv(void) { return 0; } -// CHECK: define dso_local i32 @fmv._Mls64() #[[ls64:[0-9]+]] { -__attribute__((target_version("ls64"))) int fmv(void) { return 0; } - // CHECK: define dso_local i32 @fmv._Mlse() #[[lse:[0-9]+]] { __attribute__((target_version("lse"))) int fmv(void) { return 0; } @@ -72,9 +69,6 @@ __attribute__((target_version("memtag"))) int fmv(void) { return 0; } // CHECK: define dso_local i32 @fmv._Mmops() #[[mops:[0-9]+]] { __attribute__((target_version("mops"))) int fmv(void) { return 0; } -// CHECK: define dso_local i32 @fmv._Mpredres() #[[predres:[0-9]+]] { -__attribute__((target_version("predres"))) int fmv(void) { return 0; } - // CHECK: define dso_local i32 @fmv._Mrcpc() #[[rcpc:[0-9]+]] { __attribute__((target_version("rcpc"))) int fmv(void) { return 0; } @@ -169,11 +163,9 @@ int caller() { // CHECK: attributes #[[frintts]] = { {{.*}} "target-features"="+fp-armv8,+fptoint,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[i8mm]] = { {{.*}} "target-features"="+fp-armv8,+i8mm,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[jscvt]] = { {{.*}} "target-features"="+fp-armv8,+jsconv,+neon,+outline-atomics,+v8a" -// CHECK: attributes #[[ls64]] = { {{.*}} "target-features"="+fp-armv8,+ls64,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[lse]] = { {{.*}} "target-features"="+fp-armv8,+lse,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[memtag]] = { {{.*}} "target-features"="+fp-armv8,+mte,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[mops]] = { {{.*}} "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+v8a" -// CHECK: attributes #[[predres]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+predres,+v8a" // CHECK: attributes #[[rcpc]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+v8a" // CHECK: attributes #[[rcpc2]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc-immo,+v8a" // CHECK: attributes #[[rcpc3]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc-immo,+rcpc3,+v8a" diff --git a/clang/test/CodeGen/AArch64/fmv-features.c b/clang/test/CodeGen/AArch64/fmv-features.c index d191f8187eb6b..fdc64e2cd395c 100644 --- a/clang/test/CodeGen/AArch64/fmv-features.c +++ b/clang/test/CodeGen/AArch64/fmv-features.c @@ -58,9 +58,6 @@ __attribute__((target_version("i8mm"))) int fmv(void) { return 0; } // CHECK: define dso_local i32 @fmv._Mjscvt() #[[jscvt:[0-9]+]] { __attribute__((target_version("jscvt"))) int fmv(void) { return 0; } -// CHECK: define dso_local i32 @fmv._Mls64() #[[ls64:[0-9]+]] { -__attribute__((target_version("ls64"))) int fmv(void) { return 0; } - // CHECK: define dso_local i32 @fmv._Mlse() #[[lse:[0-9]+]] { __attribute__((target_version("lse"))) int fmv(void) { return 0; } @@ -70,9 +67,6 @@ __attribute__((target_version("memtag"))) int fmv(void) { return 0; } // CHECK: define dso_local i32 @fmv._Mmops() 
#[[mops:[0-9]+]] { __attribute__((target_version("mops"))) int fmv(void) { return 0; } -// CHECK: define dso_local i32 @fmv._Mpredres() #[[predres:[0-9]+]] { -__attribute__((target_version("predres"))) int fmv(void) { return 0; } - // CHECK: define dso_local i32 @fmv._Mrcpc() #[[rcpc:[0-9]+]] { __attribute__((target_version("rcpc"))) int fmv(void) { return 0; } @@ -171,11 +165,9 @@ int caller() { // CHECK: attributes #[[frintts]] = {{.*}} "fmv-features"="frintts" // CHECK: attributes #[[i8mm]] = {{.*}} "fmv-features"="i8mm" // CHECK: attributes #[[jscvt]] = {{.*}} "fmv-features"="jscvt" -// CHECK: attributes #[[ls64]] = {{.*}} "fmv-features"="ls64" // CHECK: attributes #[[lse]] = {{.*}} "fmv-features"="lse" // CHECK: attributes #[[memtag]] = {{.*}} "fmv-features"="memtag" // CHECK: attributes #[[mops]] = {{.*}} "fmv-features"="mops" -// CHECK: attributes #[[predres]] = {{.*}} "fmv-features"="predres" // CHECK: attributes #[[rcpc]] = {{.*}} "fmv-features"="rcpc" // CHECK: attributes #[[rcpc2]] = {{.*}} "fmv-features"="rcpc2" // CHECK: attributes #[[rcpc3]] = {{.*}} "fmv-features"="rcpc3" diff --git a/clang/test/CodeGen/AArch64/fmv-priority.c b/clang/test/CodeGen/AArch64/fmv-priority.c index ff82aef89a33d..c92e0c4e9c3db 100644 --- a/clang/test/CodeGen/AArch64/fmv-priority.c +++ b/clang/test/CodeGen/AArch64/fmv-priority.c @@ -5,7 +5,7 @@ // // MSB LSB // -// sme2 | ls64 | sme | bf16 | | | fp16 | simd | fp +// sme2 | wfxt | sme | bf16 | | | fp16 | simd | fp // -----+------+-----+------+-------+------+------+------+--- // sme2 | | sme | bf16 | rcpc2 | rcpc | fp16 | simd | fp // @@ -13,7 +13,7 @@ // feature can only depend on lower priority features: // https://github.com/ARM-software/acle/pull/376 -__attribute__((target_version("sme2+ls64"))) int fn(void); +__attribute__((target_version("sme2+wfxt"))) int fn(void); __attribute__((target_version("sme2+rcpc2"))) int fn(void); __attribute__((target_version("default"))) int fn(void) { return 0; } @@ -36,12 +36,12 @@ int call() { return fn(); } // CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 153126785511392000 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 153126785511392000 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 162133984766132992 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 162133984766132992 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] // CHECK: [[RESOLVER_RETURN]]: -// CHECK-NEXT: ret ptr @fn._Mls64Msme2 +// CHECK-NEXT: ret ptr @fn._Msme2Mwfxt // CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 144119586269233920 diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c index 9e1588cd48336..ac926f2329cc4 100644 --- a/clang/test/CodeGen/attr-target-clones-aarch64.c +++ b/clang/test/CodeGen/attr-target-clones-aarch64.c @@ -12,7 +12,7 @@ int foo() { return ftc() + ftc_def() + ftc_dup1() + ftc_dup2() + ftc_dup3(); } -inline int __attribute__((target_clones("rng+simd", "rcpc+predres", "sve2-aes+wfxt"))) ftc_inline1(void) { return 1; } +inline int __attribute__((target_clones("rng+simd", "rcpc", "sve2-aes+wfxt"))) ftc_inline1(void) { return 1; } inline int __attribute__((target_clones("fp16", 
"fcma+sve2-bitperm", "default"))) ftc_inline2(void); inline int __attribute__((target_clones("bti", "sve+sb"))) ftc_inline3(void) { return 3; } @@ -336,7 +336,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@ftc_inline1._MpredresMrcpc +// CHECK-LABEL: define {{[^@]+}}@ftc_inline1._Mrcpc // CHECK-SAME: () #[[ATTR13:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 @@ -368,12 +368,12 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: ret ptr @ftc_inline1._Msve2-aesMwfxt // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 4194304 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 4194304 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @ftc_inline1._MpredresMrcpc +// CHECK-NEXT: ret ptr @ftc_inline1._Mrcpc // CHECK: resolver_else2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 769 @@ -793,7 +793,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._MpredresMrcpc +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._Mrcpc // CHECK-MTE-BTI-SAME: () #[[ATTR13:[0-9]+]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 1 @@ -825,12 +825,12 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1._Msve2-aesMwfxt // CHECK-MTE-BTI: resolver_else: // CHECK-MTE-BTI-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-MTE-BTI-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632 -// CHECK-MTE-BTI-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632 +// CHECK-MTE-BTI-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 4194304 +// CHECK-MTE-BTI-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 4194304 // CHECK-MTE-BTI-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-MTE-BTI-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK-MTE-BTI: resolver_return1: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1._MpredresMrcpc +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1._Mrcpc // CHECK-MTE-BTI: resolver_else2: // CHECK-MTE-BTI-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-MTE-BTI-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 769 diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index a75514d63bce3..11655b2efcd84 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -5,14 +5,14 @@ int __attribute__((target_version("rng+flagm+fp16fml"))) fmv(void) { return 1; } int __attribute__((target_version("flagm2+sme-i16i64"))) fmv(void) { return 2; } int __attribute__((target_version("lse+sha2"))) fmv(void) { return 3; } -int __attribute__((target_version("dotprod+ls64"))) fmv(void) { return 4; } +int __attribute__((target_version("dotprod+wfxt"))) fmv(void) { return 4; } int 
__attribute__((target_version("fp16fml+memtag"))) fmv(void) { return 5; } int __attribute__((target_version("fp+aes"))) fmv(void) { return 6; } -int __attribute__((target_version("crc+ls64"))) fmv(void) { return 7; } +int __attribute__((target_version("crc+wfxt"))) fmv(void) { return 7; } int __attribute__((target_version("bti"))) fmv(void) { return 8; } int __attribute__((target_version("sme2"))) fmv(void) { return 9; } int __attribute__((target_version("default"))) fmv(void) { return 0; } -int __attribute__((target_version("ls64+simd"))) fmv_one(void) { return 1; } +int __attribute__((target_version("wfxt+simd"))) fmv_one(void) { return 1; } int __attribute__((target_version("dpb"))) fmv_one(void) { return 2; } int __attribute__((target_version("default"))) fmv_one(void) { return 0; } int __attribute__((target_version("fp"))) fmv_two(void) { return 1; } @@ -41,7 +41,7 @@ inline int __attribute__((target_version("fp+sm4"))) fmv_inline(void) { return 1 inline int __attribute__((target_version("lse+rdm"))) fmv_inline(void) { return 16; } inline int __attribute__((target_version("default"))) fmv_inline(void) { return 3; } -__attribute__((target_version("ls64"))) int fmv_e(void); +__attribute__((target_version("wfxt"))) int fmv_e(void); int fmv_e(void) { return 20; } static __attribute__((target_version("sb"))) inline int fmv_d(void); @@ -173,7 +173,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv._MdotprodMls64 +// CHECK-LABEL: define {{[^@]+}}@fmv._MdotprodMwfxt // CHECK-SAME: () #[[ATTR3:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 @@ -194,7 +194,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv._McrcMls64 +// CHECK-LABEL: define {{[^@]+}}@fmv._McrcMwfxt // CHECK-SAME: () #[[ATTR6:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 7 @@ -222,7 +222,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_one._Mls64Msimd +// CHECK-LABEL: define {{[^@]+}}@fmv_one._MsimdMwfxt // CHECK-SAME: () #[[ATTR10:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 @@ -479,20 +479,20 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: ret ptr @fmv._Mflagm2Msme-i16i64 // CHECK: resolver_else2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 9007199254742016 -// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 9007199254742016 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 18014398509483008 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 18014398509483008 // CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] // CHECK-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] // CHECK: resolver_return3: -// CHECK-NEXT: ret ptr @fmv._McrcMls64 +// CHECK-NEXT: ret ptr @fmv._McrcMwfxt // CHECK: resolver_else4: // CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 9007199254741776 -// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 9007199254741776 +// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 18014398509482768 +// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 
18014398509482768 // CHECK-NEXT: [[TMP15:%.*]] = and i1 true, [[TMP14]] // CHECK-NEXT: br i1 [[TMP15]], label [[RESOLVER_RETURN5:%.*]], label [[RESOLVER_ELSE6:%.*]] // CHECK: resolver_return5: -// CHECK-NEXT: ret ptr @fmv._MdotprodMls64 +// CHECK-NEXT: ret ptr @fmv._MdotprodMwfxt // CHECK: resolver_else6: // CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 1125899906842624 @@ -541,12 +541,12 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 9007199254741760 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 9007199254741760 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 18014398509482752 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014398509482752 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @fmv_one._Mls64Msimd +// CHECK-NEXT: ret ptr @fmv_one._MsimdMwfxt // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 262144 @@ -593,12 +593,12 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 9007199254740992 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 9007199254740992 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 18014398509481984 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014398509481984 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @fmv_e._Mls64 +// CHECK-NEXT: ret ptr @fmv_e._Mwfxt // CHECK: resolver_else: // CHECK-NEXT: ret ptr @fmv_e.default // diff --git a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp index a2cc9f30f026a..4f553262c73b5 100644 --- a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp +++ b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --global-value-regex ".*" --version 5 // RUN: %clang_cc1 -std=c++11 -triple aarch64-linux-gnu -emit-llvm %s -o - | FileCheck %s -int __attribute__((target_clones("ls64+fp16", "default"))) foo_ovl(int) { return 1; } -int __attribute__((target_clones("fp16+ls64"))) foo_ovl(void) { return 2; } +int __attribute__((target_clones("fp16", "default"))) foo_ovl(int) { return 1; } +int __attribute__((target_clones("fp16"))) foo_ovl(void) { return 2; } int bar() { return foo_ovl(1) + foo_ovl(); @@ -45,7 +45,7 @@ void run_foo_tml() { // CHECK: @_ZN7MyClassIssE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIssE7foo_tmlEv.resolver // CHECK: @_ZN7MyClassIisE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIisE7foo_tmlEv.resolver //. 
-// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovli._Mfp16Mls64( +// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovli._Mfp16( // CHECK-SAME: i32 noundef [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 @@ -57,17 +57,17 @@ void run_foo_tml() { // CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 9007199254806784 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 9007199254806784 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 65792 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 65792 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] // CHECK: [[RESOLVER_RETURN]]: -// CHECK-NEXT: ret ptr @_Z7foo_ovli._Mfp16Mls64 +// CHECK-NEXT: ret ptr @_Z7foo_ovli._Mfp16 // CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: ret ptr @_Z7foo_ovli.default // // -// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovlv._Mfp16Mls64( +// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovlv._Mfp16( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 2 @@ -77,12 +77,12 @@ void run_foo_tml() { // CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 9007199254806784 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 9007199254806784 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 65792 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 65792 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] // CHECK: [[RESOLVER_RETURN]]: -// CHECK-NEXT: ret ptr @_Z7foo_ovlv._Mfp16Mls64 +// CHECK-NEXT: ret ptr @_Z7foo_ovlv._Mfp16 // CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: ret ptr @_Z7foo_ovlv.default // diff --git a/clang/test/Sema/attr-target-version.c b/clang/test/Sema/attr-target-version.c index 096d2f003a004..cfcc1622abe5c 100644 --- a/clang/test/Sema/attr-target-version.c +++ b/clang/test/Sema/attr-target-version.c @@ -78,7 +78,7 @@ void __attribute__((target_version("rdm+rng+crc"))) redef(void) {} int def(void); void __attribute__((target_version("dit"))) nodef(void); -void __attribute__((target_version("ls64"))) nodef(void); +void __attribute__((target_version("wfxt"))) nodef(void); void __attribute__((target_version("aes"))) ovl(void); void __attribute__((target_version("default"))) ovl(void); int bar() { diff --git a/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc b/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc index 6b373ce424678..778f568c95c5e 100644 --- a/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc +++ b/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc @@ -69,13 +69,13 @@ enum CPUFeatures { FEAT_MEMTAG2, RESERVED_FEAT_MEMTAG3, // previously used and now ABI legacy FEAT_SB, - FEAT_PREDRES, - RESERVED_FEAT_SSBS, // previously used and now ABI legacy + RESERVED_FEAT_PREDRES, // previously used and now ABI legacy + RESERVED_FEAT_SSBS, // previously used and now ABI legacy FEAT_SSBS2, FEAT_BTI, - RESERVED_FEAT_LS64, // previously used and now ABI legacy - RESERVED_FEAT_LS64_V, // previously used and now ABI legacy - FEAT_LS64_ACCDATA, + RESERVED_FEAT_LS64, // 
previously used and now ABI legacy + RESERVED_FEAT_LS64_V, // previously used and now ABI legacy + RESERVED_FEAT_LS64_ACCDATA, // previously used and now ABI legacy FEAT_WFXT, FEAT_SME_F64, FEAT_SME_I64, diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc index 56ad3f8967b9a..d5c85701ad1a0 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc @@ -74,7 +74,6 @@ void __init_cpu_features_resolver(void) { CHECK_BIT(CAP_BIT_FEAT_LSE, FEAT_LSE); CHECK_BIT(CAP_BIT_FEAT_SHA256, FEAT_SHA2); CHECK_BIT(CAP_BIT_FEAT_PMULL, FEAT_PMULL); - CHECK_BIT(CAP_BIT_FEAT_SPECRES, FEAT_PREDRES); CHECK_BIT(CAP_BIT_FEAT_SB, FEAT_SB); CHECK_BIT(CAP_BIT_FEAT_FRINTTS, FEAT_FRINTTS); CHECK_BIT(CAP_BIT_FEAT_LRCPC, FEAT_RCPC); @@ -132,7 +131,6 @@ void __init_cpu_features_resolver(void) { {"hw.optional.arm.FEAT_I8MM", FEAT_I8MM}, {"hw.optional.arm.FEAT_BF16", FEAT_BF16}, {"hw.optional.arm.FEAT_SB", FEAT_SB}, - {"hw.optional.arm.FEAT_SPECRES", FEAT_PREDRES}, {"hw.optional.arm.FEAT_SSBS", FEAT_SSBS2}, {"hw.optional.arm.FEAT_BTI", FEAT_BTI}, }; diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc index a3dbeb065403d..6d46fccdc79d9 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc @@ -81,17 +81,6 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_SME_F64); if (hwcap2 & HWCAP2_MOPS) setCPUFeature(FEAT_MOPS); - if (hwcap & HWCAP_CPUID) { - unsigned long ftr; - - getCPUFeature(ID_AA64ISAR1_EL1, ftr); - /* ID_AA64ISAR1_EL1.SPECRES >= 0b0001 */ - if (extractBits(ftr, 40, 4) >= 0x1) - setCPUFeature(FEAT_PREDRES); - /* ID_AA64ISAR1_EL1.LS64 >= 0b0011 */ - if (extractBits(ftr, 60, 4) >= 0x3) - setCPUFeature(FEAT_LS64_ACCDATA); - } if (hwcap & HWCAP_FP) { setCPUFeature(FEAT_FP); // FP and AdvSIMD fields have the same value diff --git a/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc b/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc index 6b373ce424678..778f568c95c5e 100644 --- a/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc +++ b/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc @@ -69,13 +69,13 @@ enum CPUFeatures { FEAT_MEMTAG2, RESERVED_FEAT_MEMTAG3, // previously used and now ABI legacy FEAT_SB, - FEAT_PREDRES, - RESERVED_FEAT_SSBS, // previously used and now ABI legacy + RESERVED_FEAT_PREDRES, // previously used and now ABI legacy + RESERVED_FEAT_SSBS, // previously used and now ABI legacy FEAT_SSBS2, FEAT_BTI, - RESERVED_FEAT_LS64, // previously used and now ABI legacy - RESERVED_FEAT_LS64_V, // previously used and now ABI legacy - FEAT_LS64_ACCDATA, + RESERVED_FEAT_LS64, // previously used and now ABI legacy + RESERVED_FEAT_LS64_V, // previously used and now ABI legacy + RESERVED_FEAT_LS64_ACCDATA, // previously used and now ABI legacy FEAT_WFXT, FEAT_SME_F64, FEAT_SME_I64, diff --git a/llvm/lib/Target/AArch64/AArch64FMV.td b/llvm/lib/Target/AArch64/AArch64FMV.td index e0f56fd555619..a9503b1e6248b 100644 --- a/llvm/lib/Target/AArch64/AArch64FMV.td +++ b/llvm/lib/Target/AArch64/AArch64FMV.td @@ -57,11 +57,9 @@ def : FMVExtension<"fp16fml", "FP16FML">; let BackendFeature = "fptoint" in def : FMVExtension<"frintts", "FRINTTS">; def : FMVExtension<"i8mm", "I8MM">; def : FMVExtension<"jscvt", "JSCVT">; -def : FMVExtension<"ls64", "LS64_ACCDATA">; def : FMVExtension<"lse", 
"LSE">; def : FMVExtension<"memtag", "MEMTAG2">; def : FMVExtension<"mops", "MOPS">; -def : FMVExtension<"predres", "PREDRES">; def : FMVExtension<"rcpc", "RCPC">; let BackendFeature = "rcpc-immo" in def : FMVExtension<"rcpc2", "RCPC2">; def : FMVExtension<"rcpc3", "RCPC3">; From 5daecd4a3b9c6cca10ab6d44f539adf7310ace23 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 24 Jan 2025 09:22:47 -0800 Subject: [PATCH 027/432] [Support] Fix namespace after #123990 https://llvm.org/docs/CodingStandards.html#use-namespace-qualifiers-to-implement-previously-declared-functions --- llvm/lib/Support/AArch64BuildAttributes.cpp | 34 +++++++++++---------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Support/AArch64BuildAttributes.cpp b/llvm/lib/Support/AArch64BuildAttributes.cpp index ada34eb3f927d..4a6b2fd538803 100644 --- a/llvm/lib/Support/AArch64BuildAttributes.cpp +++ b/llvm/lib/Support/AArch64BuildAttributes.cpp @@ -9,10 +9,10 @@ #include "llvm/Support/AArch64BuildAttributes.h" #include "llvm/ADT/StringSwitch.h" -namespace llvm { -namespace AArch64BuildAttributes { +using namespace llvm; +using namespace llvm::AArch64BuildAttributes; -StringRef getVendorName(unsigned Vendor) { +StringRef AArch64BuildAttributes::getVendorName(unsigned Vendor) { switch (Vendor) { case AEABI_FEATURE_AND_BITS: return "aeabi_feature_and_bits"; @@ -25,14 +25,14 @@ StringRef getVendorName(unsigned Vendor) { return ""; } } -VendorID getVendorID(StringRef Vendor) { +VendorID AArch64BuildAttributes::getVendorID(StringRef Vendor) { return StringSwitch(Vendor) .Case("aeabi_feature_and_bits", AEABI_FEATURE_AND_BITS) .Case("aeabi_pauthabi", AEABI_PAUTHABI) .Default(VENDOR_UNKNOWN); } -StringRef getOptionalStr(unsigned Optional) { +StringRef AArch64BuildAttributes::getOptionalStr(unsigned Optional) { switch (Optional) { case REQUIRED: return "required"; @@ -43,18 +43,18 @@ StringRef getOptionalStr(unsigned Optional) { return ""; } } -SubsectionOptional getOptionalID(StringRef Optional) { +SubsectionOptional AArch64BuildAttributes::getOptionalID(StringRef Optional) { return StringSwitch(Optional) .Case("required", REQUIRED) .Case("optional", OPTIONAL) .Default(OPTIONAL_NOT_FOUND); } -StringRef getSubsectionOptionalUnknownError() { +StringRef AArch64BuildAttributes::getSubsectionOptionalUnknownError() { return "unknown AArch64 build attributes optionality, expected " "required|optional"; } -StringRef getTypeStr(unsigned Type) { +StringRef AArch64BuildAttributes::getTypeStr(unsigned Type) { switch (Type) { case ULEB128: return "uleb128"; @@ -65,17 +65,17 @@ StringRef getTypeStr(unsigned Type) { return ""; } } -SubsectionType getTypeID(StringRef Type) { +SubsectionType AArch64BuildAttributes::getTypeID(StringRef Type) { return StringSwitch(Type) .Cases("uleb128", "ULEB128", ULEB128) .Cases("ntbs", "NTBS", NTBS) .Default(TYPE_NOT_FOUND); } -StringRef getSubsectionTypeUnknownError() { +StringRef AArch64BuildAttributes::getSubsectionTypeUnknownError() { return "unknown AArch64 build attributes type, expected uleb128|ntbs"; } -StringRef getPauthABITagsStr(unsigned PauthABITag) { +StringRef AArch64BuildAttributes::getPauthABITagsStr(unsigned PauthABITag) { switch (PauthABITag) { case TAG_PAUTH_PLATFORM: return "Tag_PAuth_Platform"; @@ -86,14 +86,16 @@ StringRef getPauthABITagsStr(unsigned PauthABITag) { return ""; } } -PauthABITags getPauthABITagsID(StringRef PauthABITag) { + +PauthABITags AArch64BuildAttributes::getPauthABITagsID(StringRef PauthABITag) { return StringSwitch(PauthABITag) 
.Case("Tag_PAuth_Platform", TAG_PAUTH_PLATFORM) .Case("Tag_PAuth_Schema", TAG_PAUTH_SCHEMA) .Default(PAUTHABI_TAG_NOT_FOUND); } -StringRef getFeatureAndBitsTagsStr(unsigned FeatureAndBitsTag) { +StringRef +AArch64BuildAttributes::getFeatureAndBitsTagsStr(unsigned FeatureAndBitsTag) { switch (FeatureAndBitsTag) { case TAG_FEATURE_BTI: return "Tag_Feature_BTI"; @@ -106,12 +108,12 @@ StringRef getFeatureAndBitsTagsStr(unsigned FeatureAndBitsTag) { return ""; } } -FeatureAndBitsTags getFeatureAndBitsTagsID(StringRef FeatureAndBitsTag) { + +FeatureAndBitsTags +AArch64BuildAttributes::getFeatureAndBitsTagsID(StringRef FeatureAndBitsTag) { return StringSwitch(FeatureAndBitsTag) .Case("Tag_Feature_BTI", TAG_FEATURE_BTI) .Case("Tag_Feature_PAC", TAG_FEATURE_PAC) .Case("Tag_Feature_GCS", TAG_FEATURE_GCS) .Default(FEATURE_AND_BITS_TAG_NOT_FOUND); } -} // namespace AArch64BuildAttributes -} // namespace llvm From c025b96ef9bb364c79f73fc3afb45c851c2efb17 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 24 Jan 2025 09:34:48 -0800 Subject: [PATCH 028/432] [ELF] Symbol::extract : remove unneeded file->lazy check --- lld/ELF/Symbols.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp index ce1e89f2d0801..497db842bc9be 100644 --- a/lld/ELF/Symbols.cpp +++ b/lld/ELF/Symbols.cpp @@ -254,10 +254,9 @@ void Symbol::parseSymbolVersion(Ctx &ctx) { } void Symbol::extract(Ctx &ctx) const { - if (file->lazy) { - file->lazy = false; - parseFile(ctx, file); - } + assert(file->lazy); + file->lazy = false; + parseFile(ctx, file); } uint8_t Symbol::computeBinding(Ctx &ctx) const { From 134401deea5e86d646bb99fab39c182cfa8e5292 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 24 Jan 2025 11:36:45 -0600 Subject: [PATCH 029/432] [Offload] Move RPC server handling to a dedicated thread (#112988) Summary: Handling the RPC server requires running through list of jobs that the device has requested to be done. Currently this is handled by the thread that does the waiting for the kernel to finish. However, this is not sound on NVIDIA architectures and only works for async launches in the OpenMP model that uses helper threads. However, we also don't want to have this thread doing work unnnecessarily. For this reason we track the execution of kernels and cause the thread to sleep via a condition variable (usually backed by some kind of futex or other intelligent sleeping mechanism) so that the thread will be idle while no kernels are running. --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 59 +++++---- offload/plugins-nextgen/common/include/RPC.h | 80 +++++++++++- .../common/src/PluginInterface.cpp | 8 +- offload/plugins-nextgen/common/src/RPC.cpp | 123 +++++++++++++----- .../cuda/dynamic_cuda/cuda.cpp | 1 + .../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 3 + offload/plugins-nextgen/cuda/src/rtl.cpp | 39 +++--- offload/test/libc/server.c | 56 ++++++++ 8 files changed, 281 insertions(+), 88 deletions(-) create mode 100644 offload/test/libc/server.c diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 7114dad020e3a..6fc75ac154289 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -621,9 +621,9 @@ struct AMDGPUSignalTy { } /// Wait until the signal gets a zero value. 
-  Error wait(const uint64_t ActiveTimeout = 0, RPCServerTy *RPCServer = nullptr,
+  Error wait(const uint64_t ActiveTimeout = 0,
              GenericDeviceTy *Device = nullptr) const {
-    if (ActiveTimeout && !RPCServer) {
+    if (ActiveTimeout) {
       hsa_signal_value_t Got = 1;
       Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
                                       ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
@@ -632,14 +632,11 @@ struct AMDGPUSignalTy {
     }
 
     // If there is an RPC device attached to this stream we run it as a server.
-    uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
-    auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
+    uint64_t Timeout = UINT64_MAX;
+    auto WaitState = HSA_WAIT_STATE_BLOCKED;
     while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
-                                     Timeout, WaitState) != 0) {
-      if (RPCServer && Device)
-        if (auto Err = RPCServer->runServer(*Device))
-          return Err;
-    }
+                                     Timeout, WaitState) != 0)
+      ;
     return Plugin::success();
   }
 
@@ -1052,11 +1049,6 @@ struct AMDGPUStreamTy {
   /// operation that was already finalized in a previous stream synchronize.
   uint32_t SyncCycle;
 
-  /// A pointer associated with an RPC server running on the given device. If
-  /// RPC is not being used this will be a null pointer. Otherwise, this
-  /// indicates that an RPC server is expected to be run on this stream.
-  RPCServerTy *RPCServer;
-
   /// Mutex to protect stream's management.
   mutable std::mutex Mutex;
 
@@ -1236,9 +1228,6 @@ struct AMDGPUStreamTy {
   /// Deinitialize the stream's signals.
   Error deinit() { return Plugin::success(); }
 
-  /// Attach an RPC server to this stream.
-  void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }
-
   /// Push an asynchronous kernel to the stream. The kernel arguments must be
   /// placed in a special allocation for kernel args and be kept alive until
   /// the kernel finalizes. Once the kernel is finished, the stream will release
@@ -1266,10 +1255,30 @@ struct AMDGPUStreamTy {
     if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
       return Err;
 
+    // If we are running an RPC server we want to wake up the server thread
+    // whenever there is a kernel running and let it sleep otherwise.
+    if (Device.getRPCServer())
+      Device.Plugin.getRPCServer().Thread->notify();
+
     // Push the kernel with the output signal and an input signal (optional)
-    return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
-                                   GroupSize, StackSize, OutputSignal,
-                                   InputSignal);
+    if (auto Err = Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads,
+                                           NumBlocks, GroupSize, StackSize,
+                                           OutputSignal, InputSignal))
+      return Err;
+
+    // Register a callback to indicate when the kernel is complete.
+    if (Device.getRPCServer()) {
+      if (auto Err = Slots[Curr].schedCallback(
+              [](void *Data) -> llvm::Error {
+                GenericPluginTy &Plugin =
+                    *reinterpret_cast<GenericPluginTy *>(Data);
+                Plugin.getRPCServer().Thread->finish();
+                return Error::success();
+              },
+              &Device.Plugin))
+        return Err;
+    }
+    return Plugin::success();
   }
 
   /// Push an asynchronous memory copy between pinned memory buffers.
@@ -1479,8 +1488,8 @@ struct AMDGPUStreamTy {
       return Plugin::success();
 
     // Wait until all previous operations on the stream have completed.
-    if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds,
-                                              RPCServer, &Device))
+    if (auto Err =
+            Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, &Device))
      return Err;
 
     // Reset the stream and perform all pending post actions.
@@ -3027,7 +3036,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
     : Agent(Device.getAgent()), Queue(nullptr),
       SignalManager(Device.getSignalManager()), Device(Device),
       // Initialize the std::deque with some empty positions.
-      Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
+      Slots(32), NextSlot(0), SyncCycle(0),
       StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
       UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
 
@@ -3383,10 +3392,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
     return Err;
 
-  // If this kernel requires an RPC server we attach its pointer to the stream.
-  if (GenericDevice.getRPCServer())
-    Stream->setRPCServer(GenericDevice.getRPCServer());
-
   // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
   if (ImplArgs &&
       getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) {
diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h
index 5b9b7ffd086b5..f3a8e7555020d 100644
--- a/offload/plugins-nextgen/common/include/RPC.h
+++ b/offload/plugins-nextgen/common/include/RPC.h
@@ -19,7 +19,11 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Error.h"
 
+#include <atomic>
+#include <condition_variable>
 #include <cstdint>
+#include <mutex>
+#include <thread>
 
 namespace llvm::omp::target {
 namespace plugin {
@@ -37,6 +41,12 @@ struct RPCServerTy {
   /// Initializes the handles to the number of devices we may need to service.
   RPCServerTy(plugin::GenericPluginTy &Plugin);
 
+  /// Deinitialize the associated memory and resources.
+  llvm::Error shutDown();
+
+  /// Initialize the worker thread.
+  llvm::Error startThread();
+
   /// Check if this device image is using an RPC server. This checks for the
   /// presence of an externally visible symbol in the device image that will
   /// be present whenever RPC code is called.
@@ -51,17 +61,77 @@ struct RPCServerTy {
                         plugin::GenericGlobalHandlerTy &Handler,
                         plugin::DeviceImageTy &Image);
 
-  /// Runs the RPC server associated with the \p Device until the pending work
-  /// is cleared.
-  llvm::Error runServer(plugin::GenericDeviceTy &Device);
-
   /// Deinitialize the RPC server for the given device. This will free the
   /// memory associated with the k
   llvm::Error deinitDevice(plugin::GenericDeviceTy &Device);
 
 private:
   /// Array from this device's identifier to its attached devices.
-  llvm::SmallVector<void *> Buffers;
+  std::unique_ptr<void *[]> Buffers;
+
+  /// Array of associated devices. These must be alive as long as the server is.
+  std::unique_ptr<plugin::GenericDeviceTy *[]> Devices;
+
+  /// A helper class for running the user thread that handles the RPC interface.
+  /// Because we only need to check the RPC server while any kernels are
+  /// working, we track submission / completion events to allow the thread to
+  /// sleep when it is not needed.
+  struct ServerThread {
+    std::thread Worker;
+
+    /// A boolean indicating whether or not the worker thread should continue.
+    std::atomic<bool> Running;
+
+    /// The number of currently executing kernels across all devices that need
+    /// the server thread to be running.
+    std::atomic<uint32_t> NumUsers;
+
+    /// The condition variable used to suspend the thread if no work is needed.
+    std::condition_variable CV;
+    std::mutex Mutex;
+
+    /// A reference to all the RPC interfaces that the server is handling.
+    llvm::ArrayRef<void *> Buffers;
+
+    /// A reference to the associated generic device for the buffer.
+    llvm::ArrayRef<plugin::GenericDeviceTy *> Devices;
+
+    /// Initialize the worker thread to run in the background.
+    ServerThread(void *Buffers[], plugin::GenericDeviceTy *Devices[],
+                 size_t Length)
+        : Running(true), NumUsers(0), CV(), Mutex(), Buffers(Buffers, Length),
+          Devices(Devices, Length) {}
+
+    ~ServerThread() { assert(!Running && "Thread not shut down explicitly\n"); }
+
+    /// Notify the worker thread that there is a user that needs it.
+    void notify() {
+      std::lock_guard<std::mutex> Lock(Mutex);
+      NumUsers.fetch_add(1, std::memory_order_relaxed);
+      CV.notify_all();
+    }
+
+    /// Indicate that one of the dependent users has finished.
+    void finish() {
+      [[maybe_unused]] uint32_t Old =
+          NumUsers.fetch_sub(1, std::memory_order_relaxed);
+      assert(Old > 0 && "Attempt to signal finish with no pending work");
+    }
+
+    /// Destroy the worker thread and wait.
+    void shutDown();
+
+    /// Initialize the worker thread.
+    void startThread();
+
+    /// Run the server thread to continuously check the RPC interface for work
+    /// to be done for the device.
+    void run();
+  };
+
+public:
+  /// Pointer to the server thread instance.
+  std::unique_ptr<ServerThread> Thread;
 };
 
 } // namespace llvm::omp::target
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index a164bfb51d026..c9acabea6977d 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1057,6 +1057,9 @@ Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
   if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
     return Err;
 
+  if (auto Err = Server.startThread())
+    return Err;
+
   RPCServer = &Server;
   DP("Running an RPC server on device %d\n", getDeviceId());
   return Plugin::success();
@@ -1630,8 +1633,11 @@ Error GenericPluginTy::deinit() {
   if (GlobalHandler)
     delete GlobalHandler;
 
-  if (RPCServer)
+  if (RPCServer) {
+    if (Error Err = RPCServer->shutDown())
+      return Err;
     delete RPCServer;
+  }
 
   if (RecordReplay)
     delete RecordReplay;
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index f20c8f7bcc5c9..81ad9ca66808a 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -21,8 +21,8 @@ using namespace omp;
 using namespace target;
 
 template <uint32_t NumLanes>
-rpc::Status handle_offload_opcodes(plugin::GenericDeviceTy &Device,
-                                   rpc::Server::Port &Port) {
+rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
+                                 rpc::Server::Port &Port) {
 
   switch (Port.get_opcode()) {
   case LIBC_MALLOC: {
@@ -62,21 +62,99 @@ rpc::Status handle_offload_opcodes(plugin::GenericDeviceTy &Device,
   return rpc::RPC_SUCCESS;
 }
 
-static rpc::Status handle_offload_opcodes(plugin::GenericDeviceTy &Device,
-                                          rpc::Server::Port &Port,
-                                          uint32_t NumLanes) {
+static rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
+                                        rpc::Server::Port &Port,
+                                        uint32_t NumLanes) {
   if (NumLanes == 1)
-    return handle_offload_opcodes<1>(Device, Port);
+    return handleOffloadOpcodes<1>(Device, Port);
   else if (NumLanes == 32)
-    return handle_offload_opcodes<32>(Device, Port);
+    return handleOffloadOpcodes<32>(Device, Port);
   else if (NumLanes == 64)
-    return handle_offload_opcodes<64>(Device, Port);
+    return handleOffloadOpcodes<64>(Device, Port);
   else
     return rpc::RPC_ERROR;
 }
 
+static rpc::Status runServer(plugin::GenericDeviceTy &Device, void *Buffer) {
+  uint64_t NumPorts =
+      std::min(Device.requestedRPCPortCount(), rpc::MAX_PORT_COUNT);
+  rpc::Server Server(NumPorts, Buffer);
+
+  auto Port = Server.try_open(Device.getWarpSize());
+  if (!Port)
+    return
rpc::RPC_SUCCESS;
+
+  rpc::Status Status =
+      handleOffloadOpcodes(Device, *Port, Device.getWarpSize());
+
+  // Let the `libc` library handle any other unhandled opcodes.
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  if (Status == rpc::RPC_UNHANDLED_OPCODE)
+    Status = handle_libc_opcodes(*Port, Device.getWarpSize());
+#endif
+
+  Port->close();
+
+  return Status;
+}
+
+void RPCServerTy::ServerThread::startThread() {
+  Worker = std::thread([this]() { run(); });
+}
+
+void RPCServerTy::ServerThread::shutDown() {
+  {
+    std::lock_guard<std::mutex> Lock(Mutex);
+    Running.store(false, std::memory_order_release);
+    CV.notify_all();
+  }
+  if (Worker.joinable())
+    Worker.join();
+}
+
+void RPCServerTy::ServerThread::run() {
+  std::unique_lock<std::mutex> Lock(Mutex);
+  for (;;) {
+    CV.wait(Lock, [&]() {
+      return NumUsers.load(std::memory_order_acquire) > 0 ||
+             !Running.load(std::memory_order_acquire);
+    });
+
+    if (!Running.load(std::memory_order_acquire))
+      return;
+
+    Lock.unlock();
+    while (NumUsers.load(std::memory_order_relaxed) > 0 &&
+           Running.load(std::memory_order_relaxed)) {
+      for (const auto &[Buffer, Device] : llvm::zip_equal(Buffers, Devices)) {
+        if (!Buffer || !Device)
+          continue;
+
+        // If running the server failed, print a message but keep running.
+        if (runServer(*Device, Buffer) != rpc::RPC_SUCCESS)
+          FAILURE_MESSAGE("Unhandled or invalid RPC opcode!");
+      }
+    }
+    Lock.lock();
+  }
+}
+
 RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin)
-    : Buffers(Plugin.getNumDevices()) {}
+    : Buffers(std::make_unique<void *[]>(Plugin.getNumDevices())),
+      Devices(std::make_unique<plugin::GenericDeviceTy *[]>(
+          Plugin.getNumDevices())),
+      Thread(new ServerThread(Buffers.get(), Devices.get(),
+                              Plugin.getNumDevices())) {}
+
+llvm::Error RPCServerTy::startThread() {
+  Thread->startThread();
+  return Error::success();
+}
+
+llvm::Error RPCServerTy::shutDown() {
+  Thread->shutDown();
+  return Error::success();
+}
 
 llvm::Expected<bool>
 RPCServerTy::isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
@@ -108,35 +186,14 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
                            sizeof(rpc::Client), nullptr))
     return Err;
   Buffers[Device.getDeviceId()] = RPCBuffer;
-
-  return Error::success();
-}
-
-Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) {
-  uint64_t NumPorts =
-      std::min(Device.requestedRPCPortCount(), rpc::MAX_PORT_COUNT);
-  rpc::Server Server(NumPorts, Buffers[Device.getDeviceId()]);
-
-  auto Port = Server.try_open(Device.getWarpSize());
-  if (!Port)
-    return Error::success();
-
-  int Status = handle_offload_opcodes(Device, *Port, Device.getWarpSize());
-
-  // Let the `libc` library handle any other unhandled opcodes.
-#ifdef LIBOMPTARGET_RPC_SUPPORT
-  if (Status == rpc::RPC_UNHANDLED_OPCODE)
-    Status = handle_libc_opcodes(*Port, Device.getWarpSize());
-#endif
-
-  Port->close();
-
-  if (Status != rpc::RPC_SUCCESS)
-    return createStringError("RPC server given invalid opcode!");
 
   return Error::success();
 }
 
 Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
   Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST);
+  Buffers[Device.getDeviceId()] = nullptr;
+  Devices[Device.getDeviceId()] = nullptr;
   return Error::success();
 }
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 5ec3adb9e4e3a..7878499dbfcb7 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -63,6 +63,7 @@ DLWRAP(cuStreamCreate, 2)
 DLWRAP(cuStreamDestroy, 1)
 DLWRAP(cuStreamSynchronize, 1)
 DLWRAP(cuStreamQuery, 1)
+DLWRAP(cuStreamAddCallback, 4)
 DLWRAP(cuCtxSetCurrent, 1)
 DLWRAP(cuDevicePrimaryCtxRelease, 1)
 DLWRAP(cuDevicePrimaryCtxGetState, 3)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 16c8f7ad46c44..ad874735a25ed 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -286,6 +286,8 @@ static inline void *CU_LAUNCH_PARAM_END = (void *)0x00;
 static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
 static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
 
+typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+
 CUresult cuCtxGetDevice(CUdevice *);
 CUresult cuDeviceGet(CUdevice *, int);
 CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
@@ -326,6 +328,7 @@ CUresult cuStreamCreate(CUstream *, unsigned);
 CUresult cuStreamDestroy(CUstream);
 CUresult cuStreamSynchronize(CUstream);
 CUresult cuStreamQuery(CUstream);
+CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int);
 CUresult cuCtxSetCurrent(CUcontext);
 CUresult cuDevicePrimaryCtxRelease(CUdevice);
 CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 894d1c2214b97..52e8a100dc87b 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -628,17 +628,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
   Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
     CUresult Res;
-    // If we have an RPC server running on this device we will continuously
-    // query it for work rather than blocking.
-    if (!getRPCServer()) {
-      Res = cuStreamSynchronize(Stream);
-    } else {
-      do {
-        Res = cuStreamQuery(Stream);
-        if (auto Err = getRPCServer()->runServer(*this))
-          return Err;
-      } while (Res == CUDA_ERROR_NOT_READY);
-    }
+    Res = cuStreamSynchronize(Stream);
 
     // Once the stream is synchronized, return it to stream pool and reset
     // AsyncInfo. This is to make sure the synchronization only works for its
@@ -823,17 +813,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (auto Err = getStream(AsyncInfoWrapper, Stream))
       return Err;
 
-    // If there is already pending work on the stream it could be waiting for
-    // someone to check the RPC server.
-    if (auto *RPCServer = getRPCServer()) {
-      CUresult Res = cuStreamQuery(Stream);
-      while (Res == CUDA_ERROR_NOT_READY) {
-        if (auto Err = RPCServer->runServer(*this))
-          return Err;
-        Res = cuStreamQuery(Stream);
-      }
-    }
-
     CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
     return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s");
   }
@@ -1292,9 +1271,25 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
       reinterpret_cast<void *>(&LaunchParams.Size),
       CU_LAUNCH_PARAM_END};
 
+  // If we are running an RPC server we want to wake up the server thread
+  // whenever there is a kernel running and let it sleep otherwise.
+  if (GenericDevice.getRPCServer())
+    GenericDevice.Plugin.getRPCServer().Thread->notify();
+
   CUresult Res =
       cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
                      NumThreads[0], NumThreads[1], NumThreads[2],
                      MaxDynCGroupMem, Stream, nullptr, Config);
+
+  // Register a callback to indicate when the kernel is complete.
+  if (GenericDevice.getRPCServer())
+    cuLaunchHostFunc(
+        Stream,
+        [](void *Data) {
+          GenericPluginTy &Plugin = *reinterpret_cast<GenericPluginTy *>(Data);
+          Plugin.getRPCServer().Thread->finish();
+        },
+        &GenericDevice.Plugin);
+
   return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
 }
diff --git a/offload/test/libc/server.c b/offload/test/libc/server.c
new file mode 100644
index 0000000000000..67f60a648235a
--- /dev/null
+++ b/offload/test/libc/server.c
@@ -0,0 +1,56 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+
+// REQUIRES: libc
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#pragma omp begin declare variant match(device = {kind(gpu)})
+// Extension provided by the 'libc' project.
+unsigned long long __llvm_omp_host_call(void *fn, void *args, size_t size);
+#pragma omp declare target to(__llvm_omp_host_call) device_type(nohost)
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+// Dummy host implementation to make this work for all targets.
+unsigned long long __llvm_omp_host_call(void *fn, void *args, size_t size) {
+  return ((unsigned long long (*)(void *))fn)(args);
+}
+#pragma omp end declare variant
+
+long long foo(void *data) { return -1; }
+
+void *fn_ptr = NULL;
+#pragma omp declare target to(fn_ptr)
+
+int main() {
+  fn_ptr = (void *)&foo;
+#pragma omp target update to(fn_ptr)
+
+  for (int i = 0; i < 4; ++i) {
+#pragma omp target
+    {
+      long long res = __llvm_omp_host_call(fn_ptr, NULL, 0);
+      assert(res == -1 && "RPC call failed\n");
+    }
+
+    for (int j = 0; j < 128; ++j) {
+#pragma omp target nowait
+      {
+        long long res = __llvm_omp_host_call(fn_ptr, NULL, 0);
+        assert(res == -1 && "RPC call failed\n");
+      }
+    }
+#pragma omp taskwait
+
+#pragma omp target
+    {
+      long long res = __llvm_omp_host_call(fn_ptr, NULL, 0);
+      assert(res == -1 && "RPC call failed\n");
+    }
+  }
+
+  // CHECK: PASS
+  puts("PASS");
+}

From bd8a8181288c9e16eb90fff78cbbc63b4687963a Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Fri, 24 Jan 2025 11:40:15 -0600
Subject: [PATCH 030/432] [Offload] Add cuLaunchHostFunc to dynamic cuda

Summary:
This was missing, causing non-directly linked builds to fail.
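For reference, a minimal sketch of how the wrapped entry point is used,
matching the declarations added above (illustrative only: it assumes an
initialized driver and current context, and `Func`/`Args` stand in for a
real kernel):

    // The host function runs on a CUDA-internal thread once all prior
    // work on the stream has completed; it must not call CUDA APIs.
    static void onStreamDone(void *Data) { (void)Data; }

    static CUresult launchWithCompletion(CUfunction Func, void **Args) {
      CUstream Stream;
      CUresult Res = cuStreamCreate(&Stream, /*flags=*/0);
      if (Res != CUDA_SUCCESS)
        return Res;
      Res = cuLaunchKernel(Func, /*grid=*/1, 1, 1, /*block=*/64, 1, 1,
                           /*sharedMemBytes=*/0, Stream, Args, nullptr);
      if (Res == CUDA_SUCCESS)
        Res = cuLaunchHostFunc(Stream, onStreamDone, nullptr);
      return Res;
    }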
--- offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp | 1 + offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index 7878499dbfcb7..e5332686fcffb 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -39,6 +39,7 @@ DLWRAP(cuDriverGetVersion, 1) DLWRAP(cuGetErrorString, 2) DLWRAP(cuLaunchKernel, 11) +DLWRAP(cuLaunchHostFunc, 3) DLWRAP(cuMemAlloc, 2) DLWRAP(cuMemAllocHost, 2) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index ad874735a25ed..ac075c875a8bb 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -30,15 +30,16 @@ typedef uintptr_t CUdeviceptr; typedef struct CUmod_st *CUmodule; typedef struct CUctx_st *CUcontext; typedef struct CUfunc_st *CUfunction; +typedef void (*CUhostFn)(void *userData); typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; -#define CU_DEVICE_INVALID ((CUdevice)-2) +#define CU_DEVICE_INVALID ((CUdevice)(-2)) typedef unsigned long long CUmemGenericAllocationHandle_v1; typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; -#define CU_DEVICE_INVALID ((CUdevice)-2) +#define CU_DEVICE_INVALID ((CUdevice)(-2)) typedef enum CUmemAllocationGranularity_flags_enum { CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, @@ -304,6 +305,7 @@ CUresult cuInit(unsigned); CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, CUstream, void **, void **); +CUresult cuLaunchHostFunc(CUstream, CUhostFn, void *); CUresult cuMemAlloc(CUdeviceptr *, size_t); CUresult cuMemAllocHost(void **, size_t); From 0cd794d4860e376698bb4da24bcdf8cbf331835c Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 24 Jan 2025 18:56:02 +0100 Subject: [PATCH 031/432] [libc++][chrono] implements UTC clock. (#90393) While implementing this feature and its associated LWG issues it turns out - LWG3316 Correctly define epoch for utc_clock / utc_timepoint only added non-normative wording to the standard. 
Implements parts of: - P0355 Extending to Calendars and Time Zones - P1361 Integration of chrono with text formatting - LWG3359 leap second support should allow for negative leap seconds --- libcxx/docs/Status/Cxx20Issues.csv | 2 +- libcxx/docs/Status/FormatPaper.csv | 2 +- libcxx/include/CMakeLists.txt | 1 + libcxx/include/__chrono/convert_to_tm.h | 23 + libcxx/include/__chrono/formatter.h | 18 + libcxx/include/__chrono/ostream.h | 13 + libcxx/include/__chrono/utc_clock.h | 163 +++ libcxx/include/chrono | 38 + libcxx/include/module.modulemap | 4 + libcxx/modules/std/chrono.inc | 11 +- libcxx/test/benchmarks/utc_clock.bench.cpp | 60 + .../diagnostics/chrono.nodiscard.verify.cpp | 14 + .../get_leap_second_info.pass.cpp | 147 +++ .../time.clock.utc.members/from_sys.pass.cpp | 108 ++ .../time.clock.utc.members/to_sys.pass.cpp | 117 ++ .../get_leap_second_info.pass.cpp | 128 +++ .../leap_second_info.members.pass.cpp | 37 + .../time.clock.utc.members/from_sys.pass.cpp | 245 ++++ .../time.clock.utc.members/now.pass.cpp | 38 + .../time.clock.utc.members/to_sys.pass.cpp | 252 +++++ .../time.clock.utc/types.compile.pass.cpp | 60 + .../time.clock.utc/utc_time.ostream.pass.cpp | 165 +++ .../time/time.syn/formatter.utc_time.pass.cpp | 1004 +++++++++++++++++ .../concept.formattable.compile.pass.cpp | 8 +- 24 files changed, 2653 insertions(+), 5 deletions(-) create mode 100644 libcxx/include/__chrono/utc_clock.h create mode 100644 libcxx/test/benchmarks/utc_clock.bench.cpp create mode 100644 libcxx/test/libcxx/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp create mode 100644 libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp create mode 100644 libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/leap_second_info.members.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/now.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/types.compile.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/utc_time.ostream.pass.cpp create mode 100644 libcxx/test/std/time/time.syn/formatter.utc_time.pass.cpp diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 3462557e8d668..ca286146840b1 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -238,7 +238,7 @@ "`LWG3313 `__","``join_view::iterator::operator--``\ is incorrectly constrained","2020-02 (Prague)","|Complete|","14","" "`LWG3314 `__","Is stream insertion behavior locale dependent when ``Period::type``\ is ``micro``\ ?","2020-02 (Prague)","|Complete|","16","" "`LWG3315 `__","LWG3315: Correct Allocator Default Behavior","2020-02 (Prague)","|Complete|","","" -"`LWG3316 `__","Correctly define epoch for ``utc_clock``\ / ``utc_timepoint``\ ","2020-02 (Prague)","","","" +"`LWG3316 `__","Correctly define epoch for ``utc_clock``\ / ``utc_timepoint``\ ","2020-02 (Prague)","|Nothing To Do|","","" "`LWG3317 `__","Incorrect ``operator<<``\ for floating-point durations","2020-02 (Prague)","|Complete|","16","" "`LWG3318 `__","Clarify whether clocks can 
represent time before their epoch","2020-02 (Prague)","","",""
 "`LWG3319 <https://wg21.link/LWG3319>`__","Properly reference specification of IANA time zone database","2020-02 (Prague)","|Nothing To Do|","",""
diff --git a/libcxx/docs/Status/FormatPaper.csv b/libcxx/docs/Status/FormatPaper.csv
index 7f5f46d834d3e..343fa62f13565 100644
--- a/libcxx/docs/Status/FormatPaper.csv
+++ b/libcxx/docs/Status/FormatPaper.csv
@@ -2,7 +2,7 @@ Section,Description,Dependencies,Assignee,Status,First released version
 `P1361 <https://wg21.link/P1361>`__ `P2372 <https://wg21.link/P2372>`__,"Formatting chrono"
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::duration``",,Mark de Wever,|Complete|,16
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::sys_time``",,Mark de Wever,|Complete|,17
-`[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::utc_time``",A ``<chrono>`` implementation,Mark de Wever,,,
+`[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::utc_time``",A ``<chrono>`` implementation,Mark de Wever,|Complete|,20
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::tai_time``",A ``<chrono>`` implementation,Mark de Wever,,,
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::gps_time``",A ``<chrono>`` implementation,Mark de Wever,,,
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::file_time``",,Mark de Wever,|Complete|,17
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index f3313bf53460a..78d3192542b5a 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -275,6 +275,7 @@ set(files
   __chrono/time_zone_link.h
   __chrono/tzdb.h
   __chrono/tzdb_list.h
+  __chrono/utc_clock.h
   __chrono/weekday.h
   __chrono/year.h
   __chrono/year_month.h
diff --git a/libcxx/include/__chrono/convert_to_tm.h b/libcxx/include/__chrono/convert_to_tm.h
index 8a16c4f996a86..e547e107a5852 100644
--- a/libcxx/include/__chrono/convert_to_tm.h
+++ b/libcxx/include/__chrono/convert_to_tm.h
@@ -24,6 +24,7 @@
 #include <__chrono/sys_info.h>
 #include <__chrono/system_clock.h>
 #include <__chrono/time_point.h>
+#include <__chrono/utc_clock.h>
 #include <__chrono/weekday.h>
 #include <__chrono/year.h>
 #include <__chrono/year_month.h>
@@ -98,6 +99,22 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(const chrono::sys_time<_Duration> __tp
   return __result;
 }
 
+# if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
+#  if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+template <class _Tm, class _Duration>
+_LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(chrono::utc_time<_Duration> __tp) {
+  _Tm __result = std::__convert_to_tm<_Tm>(chrono::utc_clock::to_sys(__tp));
+
+  if (chrono::get_leap_second_info(__tp).is_leap_second)
+    ++__result.tm_sec;
+
+  return __result;
+}
+
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+# endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
+
 // Convert a chrono (calendar) time point, or duration to the given _Tm type,
 // which must have the same properties as std::tm.
 template <class _Tm, class _ChronoT>
@@ -110,6 +127,12 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(const _ChronoT& __value) {
   if constexpr (__is_time_point<_ChronoT>) {
     if constexpr (same_as<typename _ChronoT::clock, chrono::system_clock>)
       return std::__convert_to_tm<_Tm>(__value);
+# if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
+#  if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+    else if constexpr (same_as<typename _ChronoT::clock, chrono::utc_clock>)
+      return std::__convert_to_tm<_Tm>(__value);
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+# endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
     else if constexpr (same_as<typename _ChronoT::clock, chrono::file_clock>)
       return std::__convert_to_tm<_Tm>(_ChronoT::clock::to_sys(__value));
     else if constexpr (same_as<typename _ChronoT::clock, chrono::local_t>)
diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h
index 400eb8c7fdcd2..6153fdc35a47b 100644
--- a/libcxx/include/__chrono/formatter.h
+++ b/libcxx/include/__chrono/formatter.h
@@ -32,6 +32,7 @@
 # include <__chrono/sys_info.h>
 # include <__chrono/system_clock.h>
 # include <__chrono/time_point.h>
+# include <__chrono/utc_clock.h>
 # include <__chrono/weekday.h>
 # include <__chrono/year.h>
 # include <__chrono/year_month.h>
@@ -719,6 +720,23 @@ struct _LIBCPP_TEMPLATE_VIS formatter<chrono::sys_time<_Duration>, _CharT> : pub
   }
 };
 
+# if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM
+#  if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+template <class _Duration, __fmt_char_type _CharT>
+struct _LIBCPP_TEMPLATE_VIS formatter<chrono::utc_time<_Duration>, _CharT> : public __formatter_chrono<_CharT> {
+public:
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
+
+  template <class _ParseContext>
+  _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
+    return _Base::__parse(__ctx, __format_spec::__fields_chrono, __format_spec::__flags::__clock);
+  }
+};
+
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+# endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM
+
 template <class _Duration, __fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::file_time<_Duration>, _CharT> : public __formatter_chrono<_CharT> {
 public:
diff --git a/libcxx/include/__chrono/ostream.h b/libcxx/include/__chrono/ostream.h
index 41884647f927d..66735e5c2c28b 100644
--- a/libcxx/include/__chrono/ostream.h
+++ b/libcxx/include/__chrono/ostream.h
@@ -26,6 +26,7 @@
 # include <__chrono/statically_widen.h>
 # include <__chrono/sys_info.h>
 # include <__chrono/system_clock.h>
+# include <__chrono/utc_clock.h>
 # include <__chrono/weekday.h>
 # include <__chrono/year.h>
 # include <__chrono/year_month.h>
@@ -61,6 +62,18 @@
 operator<<(basic_ostream<_CharT, _Traits>& __os, const sys_days& __dp) {
   return __os << year_month_day{__dp};
 }
 
+# if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM
+#  if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+template <class _CharT, class _Traits, class _Duration>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
+operator<<(basic_ostream<_CharT, _Traits>& __os, const utc_time<_Duration>& __tp) {
+  return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L%F %T}"), __tp);
+}
+
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+# endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM
+
 template <class _CharT, class _Traits, class _Duration>
 _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
 operator<<(basic_ostream<_CharT, _Traits>& __os, const file_time<_Duration> __tp) {
diff --git a/libcxx/include/__chrono/utc_clock.h b/libcxx/include/__chrono/utc_clock.h
new file mode 100644
index 0000000000000..647b6eda13ea2
--- /dev/null
+++ b/libcxx/include/__chrono/utc_clock.h
@@ -0,0 +1,163 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___CHRONO_UTC_CLOCK_H
+#define _LIBCPP___CHRONO_UTC_CLOCK_H
+
+#include <version>
+// Enable the contents of the header only when libc++ was built with experimental features enabled.
+#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+# include <__chrono/duration.h>
+# include <__chrono/leap_second.h>
+# include <__chrono/system_clock.h>
+# include <__chrono/time_point.h>
+# include <__chrono/tzdb.h>
+# include <__chrono/tzdb_list.h>
+# include <__config>
+# include <__type_traits/common_type.h>
+
+# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+# endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+# if _LIBCPP_STD_VER >= 20 && _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
+
+namespace chrono {
+
+class utc_clock;
+
+template <class _Duration>
+using utc_time  = time_point<utc_clock, _Duration>;
+using utc_seconds = utc_time<seconds>;
+
+class utc_clock {
+public:
+  using rep        = system_clock::rep;
+  using period     = system_clock::period;
+  using duration   = chrono::duration<rep, period>;
+  using time_point = chrono::time_point<utc_clock>;
+  static constexpr bool is_steady = false; // The system_clock is not steady.
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static time_point now() { return from_sys(system_clock::now()); }
+
+  template <class _Duration>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static sys_time<common_type_t<_Duration, seconds>>
+  to_sys(const utc_time<_Duration>& __time);
+
+  template <class _Duration>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static utc_time<common_type_t<_Duration, seconds>>
+  from_sys(const sys_time<_Duration>& __time) {
+    using _Rp = utc_time<common_type_t<_Duration, seconds>>;
+    // TODO TZDB investigate optimizations.
+    //
+    // The leap second database stores all transitions; this means that to
+    // calculate the current number of leap seconds the code needs to iterate
+    // over all leap seconds to accumulate the sum. Then the sum can be used
+    // to determine the sys_time. Accessing the database involves acquiring a
+    // mutex.
+    //
+    // The historic entries in the database are immutable. Hard-coding these
+    // values in a table would allow:
+    // - Storing the sum, allowing a binary search on the data.
+    // - Avoiding acquiring a mutex.
+    // The disadvantage is a slightly larger code size.
+    //
+    // There are two optimization directions:
+    // - Hard-code the database and do a linear search for future entries.
+    //   This search can start at the back, and should probably contain very
+    //   few entries. (Adding leap seconds is quite rare and new releases of
+    //   libc++ can add the new entries; they are announced half a year before
+    //   they are added.)
+    // - While parsing the leap seconds, store an additional database in the
+    //   dylib with the list of the sums of the leap seconds. In that case
+    //   there can be a private function __get_utc_to_sys_table that returns
+    //   the table.
+    //
+    // Note for to_sys there are no optimizations to be done; it uses
+    // get_leap_second_info. The function get_leap_second_info could benefit
+    // from optimizations as described above; again both options apply.
+
+    // Both UTC and the system clock use the same epoch. The Standard
+    // specifies the epoch as 1970-01-01 even though UTC starts at
+    // 1972-01-01 00:00:10 TAI. So when the sys_time is before the epoch we
+    // can be sure both clocks return the same value.
+
+    const tzdb& __tzdb = chrono::get_tzdb();
+    _Rp __result{__time.time_since_epoch()};
+    for (const auto& __leap_second : __tzdb.leap_seconds) {
+      if (__leap_second > __time)
+        return __result;
+
+      __result += __leap_second.value();
+    }
+    return __result;
+  }
+};
+
+struct leap_second_info {
+  bool is_leap_second;
+  seconds elapsed;
+};
+
+template <class _Duration>
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI leap_second_info get_leap_second_info(const utc_time<_Duration>& __time) {
+  const tzdb& __tzdb = chrono::get_tzdb();
+  if (__tzdb.leap_seconds.empty()) [[unlikely]]
+    return {false, chrono::seconds{0}};
+
+  sys_seconds __sys{chrono::floor<seconds>(__time).time_since_epoch()};
+  seconds __elapsed{0};
+  for (const auto& __leap_second : __tzdb.leap_seconds) {
+    if (__sys == __leap_second.date() + __elapsed)
+      // A time point may only be a leap second during a positive leap second
+      // insertion, since time points that occur during a (theoretical)
+      // negative leap second don't exist.
+      return {__leap_second.value() > 0s, __elapsed + __leap_second.value()};
+
+    if (__sys < __leap_second.date() + __elapsed)
+      return {false, __elapsed};
+
+    __elapsed += __leap_second.value();
+  }
+
+  return {false, __elapsed};
+}
+
+template <class _Duration>
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI sys_time<common_type_t<_Duration, seconds>>
+utc_clock::to_sys(const utc_time<_Duration>& __time) {
+  using _Dp = common_type_t<_Duration, seconds>;
+  leap_second_info __info = chrono::get_leap_second_info(__time);
+
+  // [time.clock.utc.members]/2
+  //   Returns: A sys_time t, such that from_sys(t) == u if such a mapping
+  //   exists. Otherwise u represents a time_point during a positive leap
+  //   second insertion, the conversion counts that leap second as not
+  //   inserted, and the last representable value of sys_time prior to the
+  //   insertion of the leap second is returned.
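+  //
+  // Editorial worked example, not part of the upstream patch: with the real
+  // 1972-06-30 23:59:60 UTC insertion, a utc_time<milliseconds> 500ms into
+  // that leap second has __info = {true, 1s}. Plain subtraction of
+  // __info.elapsed would yield 23:59:59.500 sys (which from_sys would not
+  // map back to the input), so the clamp below returns 23:59:59.999 instead,
+  // the last representable millisecond before the insertion.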
+  sys_time<common_type_t<_Duration, seconds>> __result{__time.time_since_epoch() - __info.elapsed};
+  if (__info.is_leap_second)
+    return chrono::floor<seconds>(__result) + chrono::seconds{1} - _Dp{1};
+
+  return __result;
+}
+
+} // namespace chrono
+
+# endif // _LIBCPP_STD_VER >= 20 && _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM &&
+        // _LIBCPP_HAS_LOCALIZATION
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+#endif // _LIBCPP___CHRONO_UTC_CLOCK_H
diff --git a/libcxx/include/chrono b/libcxx/include/chrono
index d9a8afef933b9..10695eea649fb 100644
--- a/libcxx/include/chrono
+++ b/libcxx/include/chrono
@@ -300,6 +300,41 @@ template<class charT, class traits> // C++20
   basic_ostream<charT, traits>&
     operator<<(basic_ostream<charT, traits>& os, const sys_days& dp);
 
+// [time.clock.utc], class utc_clock
+class utc_clock { // C++20
+public:
+  using rep = a signed arithmetic type;
+  using period = ratio<unspecified, unspecified>;
+  using duration = chrono::duration<rep, period>;
+  using time_point = chrono::time_point<utc_clock>;
+  static constexpr bool is_steady = unspecified;
+
+  static time_point now();
+
+  template<class Duration>
+  static sys_time<common_type_t<Duration, seconds>>
+  to_sys(const utc_time<Duration>& t);
+  template<class Duration>
+  static utc_time<common_type_t<Duration, seconds>>
+  from_sys(const sys_time<Duration>& t);
+};
+
+template<class Duration>
+using utc_time = time_point<utc_clock, Duration>; // C++20
+using utc_seconds = utc_time<seconds>;            // C++20
+
+template<class charT, class traits, class Duration> // C++20
+  basic_ostream<charT, traits>&
+  operator<<(basic_ostream<charT, traits>& os, const utc_time<Duration>& t);
+
+struct leap_second_info { // C++20
+  bool is_leap_second;
+  seconds elapsed;
+};
+
+template<class Duration> // C++20
+  leap_second_info get_leap_second_info(const utc_time<Duration>& ut);
+
 class file_clock // C++20
 {
 public:
@@ -861,6 +896,8 @@ strong_ordering operator<=>(const time_zone_link& x, const time_zone_link& y);
 namespace std {
   template<class Duration, class charT>
     struct formatter<chrono::sys_time<Duration>, charT>;  // C++20
+  template<class Duration, class charT>
+    struct formatter<chrono::utc_time<Duration>, charT>;  // C++20
   template<class Duration, class charT>
     struct formatter<chrono::file_time<Duration>, charT>; // C++20
   template<class Duration, class charT>
@@ -981,6 +1018,7 @@ constexpr chrono::year operator ""y(unsigned lo
 #  include <__chrono/time_zone_link.h>
 #  include <__chrono/tzdb.h>
 #  include <__chrono/tzdb_list.h>
+#  include <__chrono/utc_clock.h>
 #  include <__chrono/zoned_time.h>
 # endif
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index cdac9c883ecab..85b88ca137f85 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -980,6 +980,10 @@ module std [system] {
     export std.string // public data member of type std::string
     export std.vector // public data members of type std::vector
   }
+  module utc_clock {
+    header "__chrono/utc_clock.h"
+    export std.chrono.time_point
+  }
   module weekday { header "__chrono/weekday.h" }
   module year_month_day { header "__chrono/year_month_day.h" }
   module year_month_weekday { header "__chrono/year_month_weekday.h" }
diff --git a/libcxx/modules/std/chrono.inc b/libcxx/modules/std/chrono.inc
index df21d1fbac585..98f14f716c207 100644
--- a/libcxx/modules/std/chrono.inc
+++ b/libcxx/modules/std/chrono.inc
@@ -84,7 +84,9 @@ export namespace std {
   using std::chrono::sys_seconds;
   using std::chrono::sys_time;
 
-#if 0
+#if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
+# ifdef _LIBCPP_ENABLE_EXPERIMENTAL
+
   // [time.clock.utc], class utc_clock
   using std::chrono::utc_clock;
 
@@ -94,6 +96,8 @@ export namespace std {
 
   using std::chrono::leap_second_info;
   using std::chrono::get_leap_second_info;
+
+# if 0
   // [time.clock.tai], class tai_clock
   using std::chrono::tai_clock;
 
@@ -105,7 +109,10 @@ export namespace std {
   using std::chrono::gps_seconds;
   using std::chrono::gps_time;
 
-#endif
+# endif
+# endif // _LIBCPP_ENABLE_EXPERIMENTAL
+#endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
+
 
   // [time.clock.file], type file_clock
   using std::chrono::file_clock;
diff --git a/libcxx/test/benchmarks/utc_clock.bench.cpp b/libcxx/test/benchmarks/utc_clock.bench.cpp
new file mode 100644
index 0000000000000..c44652a8f7ae0
--- /dev/null
+++ b/libcxx/test/benchmarks/utc_clock.bench.cpp
@@ -0,0 +1,60 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+#include <chrono>
+
+#include "benchmark/benchmark.h"
+
+// Benchmarks the performance of the UTC <-> system time conversions. These
+// operations determine the sum of leap second insertions at a specific time.
+
+static void BM_from_sys(benchmark::State& state) {
+  std::chrono::sys_days date{std::chrono::July / 1 / state.range(0)};
+  for (auto _ : state)
+    benchmark::DoNotOptimize(std::chrono::utc_clock::from_sys(date));
+}
+
+BENCHMARK(BM_from_sys)
+    ->Arg(1970)  // before the first leap seconds
+    ->Arg(1979)  // in the first half of inserted leap seconds
+    ->Arg(1993)  // in the second half of inserted leap seconds
+    ->Arg(2100); // after the last leap second
+
+BENCHMARK(BM_from_sys)->Arg(1970)->Arg(1979)->Arg(1993)->Arg(2100)->Threads(4);
+BENCHMARK(BM_from_sys)->Arg(1970)->Arg(1979)->Arg(1993)->Arg(2100)->Threads(16);
+
+static void BM_to_sys(benchmark::State& state) {
+  // The 59 sec offset means we pass the UTC offset for the leap second;
+  // assuming there won't be more than 59 leap seconds ever.
+ std::chrono::utc_seconds date{ + std::chrono::sys_days{std::chrono::July / 1 / state.range(0)}.time_since_epoch() + std::chrono::seconds{59}}; + for (auto _ : state) + benchmark::DoNotOptimize(std::chrono::utc_clock::to_sys(date)); +} + +BENCHMARK(BM_to_sys) + ->Arg(1970) // before the first leap seconds + ->Arg(1979) // in the first half of inserted leap seconds + ->Arg(1993) // in the second half of inserted leap seconds + ->Arg(2100); // after the last leap second + +BENCHMARK(BM_to_sys)->Arg(1970)->Arg(1979)->Arg(1993)->Arg(2100)->Threads(4); +BENCHMARK(BM_to_sys)->Arg(1970)->Arg(1979)->Arg(1993)->Arg(2100)->Threads(16); + +int main(int argc, char** argv) { + benchmark::Initialize(&argc, argv); + if (benchmark::ReportUnrecognizedArguments(argc, argv)) + return 1; + + benchmark::RunSpecifiedBenchmarks(); +} diff --git a/libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp index 4cb10ae3c35e9..644c5b598c018 100644 --- a/libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp @@ -75,6 +75,20 @@ void test(std::chrono::time_zone tz, std::chrono::time_zone_link link, std::chro t::locate_zone(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} } + { // [time.clock.utc] + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::chrono::utc_clock::now(); + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::chrono::utc_clock::to_sys(std::chrono::utc_seconds{}); + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::chrono::utc_clock::from_sys(std::chrono::sys_seconds{}); + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::chrono::get_leap_second_info(std::chrono::utc_seconds{}); + } + { std::chrono::zoned_time zt; diff --git a/libcxx/test/libcxx/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp b/libcxx/test/libcxx/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp new file mode 100644 index 0000000000000..e87c5438179ef --- /dev/null +++ b/libcxx/test/libcxx/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp @@ -0,0 +1,147 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// +// +// class utc_clock; + +// template +// std::chrono::leap_second_info get_leap_second_info(const utc_time& ut); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" +#include "filesystem_test_helper.h" +#include "test_tzdb.h" + +scoped_test_env env; +[[maybe_unused]] const std::filesystem::path dir = env.create_dir("zoneinfo"); +const std::filesystem::path tzdata = env.create_file("zoneinfo/tzdata.zi"); +const std::filesystem::path leap_seconds = env.create_file("zoneinfo/leap-seconds.list"); + +std::string_view std::chrono::__libcpp_tzdb_directory() { + static std::string result = dir.string(); + return result; +} + +static void write(std::string_view input) { + static int version = 0; + + { + std::ofstream f{tzdata}; + f << "# version " << version++ << '\n'; + std::ofstream{leap_seconds}.write(input.data(), input.size()); + } + std::chrono::reload_tzdb(); +} + +template +static void test_leap_second_info( + std::chrono::time_point time, bool is_leap_second, std::chrono::seconds elapsed) { + std::chrono::leap_second_info result = std::chrono::get_leap_second_info(time); + TEST_REQUIRE( + result.is_leap_second == is_leap_second && result.elapsed == elapsed, + TEST_WRITE_CONCATENATED( + "\nExpected output [", + is_leap_second, + ", ", + elapsed, + "]\nActual output [", + result.is_leap_second, + ", ", + result.elapsed, + "]\n")); +} + +static void test_no_leap_seconds_entries() { + using namespace std::literals::chrono_literals; + + write(""); + + test_leap_second_info( + std::chrono::utc_seconds{std::chrono::sys_days{std::chrono::January / 1 / 1900}.time_since_epoch()}, false, 0s); + test_leap_second_info( + std::chrono::utc_seconds{std::chrono::sys_days{std::chrono::January / 1 / 2000}.time_since_epoch()}, false, 0s); + test_leap_second_info( + std::chrono::utc_seconds{std::chrono::sys_days{std::chrono::January / 1 / 3000}.time_since_epoch()}, false, 0s); +} + +// Note at the time of writing all leap seconds are positive. This test uses +// fake data to test the behaviour of negative leap seconds. +static void test_negative_leap_seconds() { + using namespace std::literals::chrono_literals; + + // Use small values for simplicity. The dates are seconds since 1.1.1900. + write( + R"( +1 10 +60 11 +120 12 +180 11 +240 12 +300 13 +360 12 +)"); + + // Transitions from the start of UTC. 
+ auto test_transition = [](std::chrono::utc_seconds time, std::chrono::seconds elapsed, bool positive) { + if (positive) { + // Every transition has the following tests + // - 1ns before the start of the transition is_leap_second -> false, elapsed -> elapsed + // - at the start of the transition is_leap_second -> true, elapsed -> elapsed + 1 + // - 1ns after the start of the transition is_leap_second -> true, elapsed -> elapsed + 1 + // - 1ns before the end of the transition is_leap_second -> true, elapsed -> elapsed + 1 + // - at the end of the transition is_leap_second -> false, elapsed -> elapsed + 1 + + test_leap_second_info(time - 1ns, false, elapsed); + test_leap_second_info(time, true, elapsed + 1s); + test_leap_second_info(time + 1ns, true, elapsed + 1s); + test_leap_second_info(time + 1s - 1ns, true, elapsed + 1s); + test_leap_second_info(time + 1s, false, elapsed + 1s); + } else { + // Every transition has the following tests + // - 1ns before the transition is_leap_second -> false, elapsed -> elapsed + // - at the transition is_leap_second -> false elapsed -> elapsed - 1 + // - 1ns after the transition is_leap_second -> false, elapsed -> elapsed - 1 + test_leap_second_info(time - 1ns, false, elapsed); + test_leap_second_info(time, false, elapsed - 1s); + test_leap_second_info(time + 1ns, false, elapsed - 1s); + } + }; + + std::chrono::utc_seconds epoch{std::chrono::sys_days{std::chrono::January / 1 / 1900}.time_since_epoch()}; + test_leap_second_info(epoch, false, 0s); + + // The UTC times are: + // epoch + transition time in the database + leap seconds before the transition. + test_transition(epoch + 60s + 0s, 0s, true); + test_transition(epoch + 120s + 1s, 1s, true); + test_transition(epoch + 180s + 2s, 2s, false); + test_transition(epoch + 240s + 1s, 1s, true); + test_transition(epoch + 300s + 2s, 2s, true); + test_transition(epoch + 360s + 3s, 3s, false); +} + +int main(int, const char**) { + test_no_leap_seconds_entries(); + test_negative_leap_seconds(); + + return 0; +} diff --git a/libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp b/libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp new file mode 100644 index 0000000000000..2468daa95c29d --- /dev/null +++ b/libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp @@ -0,0 +1,108 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// +// +// class utc_clock; + +// template +// static utc_time> +// from_sys(const sys_time& time); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" +#include "filesystem_test_helper.h" +#include "test_tzdb.h" + +scoped_test_env env; +[[maybe_unused]] const std::filesystem::path dir = env.create_dir("zoneinfo"); +const std::filesystem::path tzdata = env.create_file("zoneinfo/tzdata.zi"); +const std::filesystem::path leap_seconds = env.create_file("zoneinfo/leap-seconds.list"); + +std::string_view std::chrono::__libcpp_tzdb_directory() { + static std::string result = dir.string(); + return result; +} + +static void write(std::string_view input) { + static int version = 0; + + std::ofstream f{tzdata}; + f << "# version " << version++ << '\n'; + std::ofstream{leap_seconds}.write(input.data(), input.size()); +} + +template +static void +test_leap_seconds(std::chrono::time_point time, std::chrono::seconds expected) { + auto utc = std::chrono::utc_clock::from_sys(time); + auto diff = utc.time_since_epoch() - time.time_since_epoch(); + TEST_REQUIRE( + diff == expected, + TEST_WRITE_CONCATENATED("\tTime: ", time, "\nExpected output ", expected, "\nActual output ", diff, '\n')); +} + +// Note at the time of writing all leap seconds are positive. This test uses +// fake data to test the behaviour of negative leap seconds. +int main(int, const char**) { + using namespace std::literals::chrono_literals; + + // Use small values for simplicity. The dates are seconds since 1.1.1970. + write( + R"( +1 10 +60 11 +120 12 +180 11 +240 12 +300 13 +360 12 +)"); + + std::chrono::sys_days epoch = {std::chrono::January / 1 / 1900}; + test_leap_seconds(epoch, 0s); + + test_leap_seconds(epoch + 60s - 1ns, 0s); + test_leap_seconds(epoch + 60s, 1s); + test_leap_seconds(epoch + 60s + 1ns, 1s); + + test_leap_seconds(epoch + 120s - 1ns, 1s); + test_leap_seconds(epoch + 120s, 2s); + test_leap_seconds(epoch + 120s + 1ns, 2s); + + test_leap_seconds(epoch + 180s - 1ns, 2s); + test_leap_seconds(epoch + 180s, 1s); + test_leap_seconds(epoch + 180s + 1ns, 1s); + + test_leap_seconds(epoch + 240s - 1ns, 1s); + test_leap_seconds(epoch + 240s, 2s); + test_leap_seconds(epoch + 240s + 1ns, 2s); + + test_leap_seconds(epoch + 300s - 1ns, 2s); + test_leap_seconds(epoch + 300s, 3s); + test_leap_seconds(epoch + 300s + 1ns, 3s); + + test_leap_seconds(epoch + 360s - 1ns, 3s); + test_leap_seconds(epoch + 360s, 2s); + test_leap_seconds(epoch + 360s + 1ns, 2s); + + return 0; +} diff --git a/libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp b/libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp new file mode 100644 index 0000000000000..ab4dff46d9184 --- /dev/null +++ b/libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp @@ -0,0 +1,117 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// +// +// class utc_clock; + +// static sys_time> +// to_sys(const utc_time<_Duration>& __time); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" +#include "filesystem_test_helper.h" +#include "test_tzdb.h" + +scoped_test_env env; +[[maybe_unused]] const std::filesystem::path dir = env.create_dir("zoneinfo"); +const std::filesystem::path tzdata = env.create_file("zoneinfo/tzdata.zi"); +const std::filesystem::path leap_seconds = env.create_file("zoneinfo/leap-seconds.list"); + +std::string_view std::chrono::__libcpp_tzdb_directory() { + static std::string result = dir.string(); + return result; +} + +static void write(std::string_view input) { + static int version = 0; + + std::ofstream f{tzdata}; + f << "# version " << version++ << '\n'; + std::ofstream{leap_seconds}.write(input.data(), input.size()); +} + +template +static void test_leap_seconds(std::chrono::utc_time time, std::chrono::sys_time expected) { + auto result = std::chrono::utc_clock::to_sys(time); + TEST_REQUIRE(result == expected, + TEST_WRITE_CONCATENATED("\nExpected output ", expected, "\nActual output ", result, '\n')); +} + +// Note at the time of writing all leap seconds are positive. This test uses +// fake data to test the behaviour of negative leap seconds. +int main(int, const char**) { + using namespace std::literals::chrono_literals; + + // Use small values for simplicity. The dates are seconds since 1.1.1970. + write( + R"( +1 10 +60 11 +120 12 +180 11 +240 12 +300 13 +360 12 +)"); + + std::chrono::sys_seconds sys_epoch{std::chrono::sys_days{std::chrono::January / 1 / 1900}}; + std::chrono::utc_seconds utc_epoch{sys_epoch.time_since_epoch()}; + + test_leap_seconds(utc_epoch, sys_epoch); + auto test_transition = [](std::chrono::sys_seconds sys, std::chrono::seconds elapsed, bool positive) { + std::chrono::utc_seconds utc = std::chrono::utc_seconds{sys.time_since_epoch()} + elapsed; + if (positive) { + // Every transition has the following tests + // - 1ns before the start of the transition no adjustment needed + // - at the start of the transition sys is clamped at the time just prior to the moment + // of the leap second insertion. The exact value depends + // on the resolution of the result type. 
+ // - 1ns before the end of the transition sys is still clamped like before + // - at the end of the transition sys is 1s behind the utc time + // - 1ns after the end of the transition sys is still 1s behind the utc time + test_leap_seconds(utc - 1ns, sys - 1ns); + test_leap_seconds(utc, sys - 1s); + test_leap_seconds(utc + 0ns, sys - 1ns); + test_leap_seconds(utc + 1s - 1ns, sys - 1ns); + test_leap_seconds(utc + 1s, sys); + test_leap_seconds(utc + 1s + 0ns, sys + 0ns); + test_leap_seconds(utc + 1s + 1ns, sys + 1ns); + } else { + // Every transition has the following tests + // - 1ns before the transition no adjustment needed + // - at the transition sys is 1s ahead of the utc time + // - 1ns after the transition sys is still 1s ahead of the utc time + test_leap_seconds(utc - 1ns, sys - 1ns); + test_leap_seconds(utc, sys + 1s); + test_leap_seconds(utc + 1ns, sys + 1s + 1ns); + } + }; + + test_transition(sys_epoch + 60s, 0s, true); + test_transition(sys_epoch + 120s, 1s, true); + test_transition(sys_epoch + 180s, 2s, false); + test_transition(sys_epoch + 240s, 1s, true); + test_transition(sys_epoch + 300s, 2s, true); + test_transition(sys_epoch + 360s, 3s, false); + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp new file mode 100644 index 0000000000000..9d06d479ad90c --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp @@ -0,0 +1,128 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// +// +// class utc_clock; + +// template +// leap_second_info get_leap_second_info(const utc_time& ut); + +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" + +template +static void test_leap_second_info( + std::chrono::time_point time, bool is_leap_second, std::chrono::seconds elapsed) { + std::chrono::leap_second_info result = std::chrono::get_leap_second_info(time); + TEST_REQUIRE( + result.is_leap_second == is_leap_second && result.elapsed == elapsed, + TEST_WRITE_CONCATENATED( + "\nExpected output [", + is_leap_second, + ", ", + elapsed, + "]\nActual output [", + result.is_leap_second, + ", ", + result.elapsed, + "]\n")); +} + +static std::chrono::utc_seconds get_utc_time(long long seconds_since_1900) { + // The file leap-seconds.list stores dates since 1 January 1900, 00:00:00, we want + // seconds since 1 January 1970. + constexpr auto offset = + std::chrono::sys_days{std::chrono::January / 1 / 1970} - std::chrono::sys_days{std::chrono::January / 1 / 1900}; + return std::chrono::utc_seconds{std::chrono::seconds{seconds_since_1900} - offset}; +} + +// Tests set of existing database entries at the time of writing. +int main(int, const char**) { + using namespace std::literals::chrono_literals; + + test_leap_second_info(std::chrono::utc_seconds::min(), false, 0s); + + // Epoch transition no transitions. 
+ test_leap_second_info(std::chrono::utc_seconds{-1s}, false, 0s); + test_leap_second_info(std::chrono::utc_seconds{0s}, false, 0s); + test_leap_second_info(std::chrono::utc_seconds{1s}, false, 0s); + + // Transitions from the start of UTC. + auto test_transition = [](std::chrono::utc_seconds time, std::chrono::seconds elapsed, bool positive) { + // Note at the time of writing all leap seconds are positive so the else + // branch is never executed. The private test for this function tests + // negative leap seconds and uses the else branch. + + if (positive) { + // Every transition has the following tests + // - 1ns before the start of the transition is_leap_second -> false, elapsed -> elapsed + // - at the start of the transition is_leap_second -> true, elapsed -> elapsed + 1 + // - 1ns after the start of the transition is_leap_second -> true, elapsed -> elapsed + 1 + // - 1ns before the end of the transition is_leap_second -> true, elapsed -> elapsed + 1 + // - at the end of the transition is_leap_second -> false, elapsed -> elapsed + 1 + + test_leap_second_info(time - 1ns, false, elapsed); + test_leap_second_info(time, true, elapsed + 1s); + test_leap_second_info(time + 1ns, true, elapsed + 1s); + test_leap_second_info(time + 1s - 1ns, true, elapsed + 1s); + test_leap_second_info(time + 1s, false, elapsed + 1s); + } else { + // Every transition has the following tests + // - 1ns before the transition is_leap_second -> false, elapsed -> elapsed + // - at the transition is_leap_second -> false elapsed -> elapsed - 1 + // - 1ns after the transition is_leap_second -> false, elapsed -> elapsed - 1 + test_leap_second_info(time - 1ns, false, elapsed); + test_leap_second_info(time, false, elapsed - 1s); + test_leap_second_info(time + 1ns, false, elapsed - 1s); + } + }; + + // The timestamps are from leap-seconds.list in the IANA database. + // Note the times stamps are timestamps without leap seconds so the number + // here are incremented by x "leap seconds". 
+ test_transition(get_utc_time(2287785600 + 0), 0s, true); // 1 Jul 1972 + test_transition(get_utc_time(2303683200 + 1), 1s, true); // 1 Jan 1973 + test_transition(get_utc_time(2335219200 + 2), 2s, true); // 1 Jan 1974 + test_transition(get_utc_time(2366755200 + 3), 3s, true); // 1 Jan 1975 + test_transition(get_utc_time(2398291200 + 4), 4s, true); // 1 Jan 1976 + test_transition(get_utc_time(2429913600 + 5), 5s, true); // 1 Jan 1977 + test_transition(get_utc_time(2461449600 + 6), 6s, true); // 1 Jan 1978 + test_transition(get_utc_time(2492985600 + 7), 7s, true); // 1 Jan 1979 + test_transition(get_utc_time(2524521600 + 8), 8s, true); // 1 Jan 1980 + test_transition(get_utc_time(2571782400 + 9), 9s, true); // 1 Jul 1981 + test_transition(get_utc_time(2603318400 + 10), 10s, true); // 1 Jul 1982 + test_transition(get_utc_time(2634854400 + 11), 11s, true); // 1 Jul 1983 + test_transition(get_utc_time(2698012800 + 12), 12s, true); // 1 Jul 1985 + test_transition(get_utc_time(2776982400 + 13), 13s, true); // 1 Jan 1988 + test_transition(get_utc_time(2840140800 + 14), 14s, true); // 1 Jan 1990 + test_transition(get_utc_time(2871676800 + 15), 15s, true); // 1 Jan 1991 + test_transition(get_utc_time(2918937600 + 16), 16s, true); // 1 Jul 1992 + test_transition(get_utc_time(2950473600 + 17), 17s, true); // 1 Jul 1993 + test_transition(get_utc_time(2982009600 + 18), 18s, true); // 1 Jul 1994 + test_transition(get_utc_time(3029443200 + 19), 19s, true); // 1 Jan 1996 + test_transition(get_utc_time(3076704000 + 20), 20s, true); // 1 Jul 1997 + test_transition(get_utc_time(3124137600 + 21), 21s, true); // 1 Jan 1999 + test_transition(get_utc_time(3345062400 + 22), 22s, true); // 1 Jan 2006 + test_transition(get_utc_time(3439756800 + 23), 23s, true); // 1 Jan 2009 + test_transition(get_utc_time(3550089600 + 24), 24s, true); // 1 Jul 2012 + test_transition(get_utc_time(3644697600 + 25), 25s, true); // 1 Jul 2015 + test_transition(get_utc_time(3692217600 + 26), 26s, true); // 1 Jan 2017 + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/leap_second_info.members.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/leap_second_info.members.pass.cpp new file mode 100644 index 0000000000000..90cf99d4b30c7 --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/leap_second_info.members.pass.cpp @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// + +// struct leap_second_info { +// bool is_leap_second; +// seconds elapsed; +// }; + +#include +#include + +// Validates whether: +// - The members are present as non-const members. +// - The struct is an aggregate. 
+int main(int, const char**) { + static_assert(std::is_aggregate_v); + + std::chrono::leap_second_info leap_second_info{.is_leap_second = false, .elapsed = std::chrono::seconds(0)}; + + [[maybe_unused]] bool& is_leap_second = leap_second_info.is_leap_second; + [[maybe_unused]] std::chrono::seconds& elapsed = leap_second_info.elapsed; + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp new file mode 100644 index 0000000000000..ab22cfafa2b0f --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp @@ -0,0 +1,245 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// +// +// class utc_clock; + +// template +// static utc_time> +// from_sys(const sys_time& time); + +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" + +template +static void test_leap_seconds(std::chrono::time_point time, + std::chrono::seconds leap_seconds) { + auto utc = std::chrono::utc_clock::from_sys(time); + auto diff = utc.time_since_epoch() - time.time_since_epoch(); + TEST_REQUIRE( + diff == leap_seconds, + TEST_WRITE_CONCATENATED("\tTime: ", time, "\nExpected output ", leap_seconds, "\nActual output ", diff, '\n')); +} + +// This test is based on the example in [time.clock.utc.members]/3 +static void test_example_standard() { + using namespace std::literals::chrono_literals; + + auto t = std::chrono::sys_days{std::chrono::July / 1 / 2015} - 2ns; + test_leap_seconds(t, 25s); + + t += 1ns; + test_leap_seconds(t, 25s); + + t += 1ns; + test_leap_seconds(t, 26s); + + t += 1ns; + test_leap_seconds(t, 26s); +} + +// Tests set of existing database entries at the time of writing. +static void test_transitions() { + using namespace std::literals::chrono_literals; + + test_leap_seconds(std::chrono::sys_seconds::min(), 0s); + test_leap_seconds(std::chrono::sys_days::min(), 0s); + + // Epoch transition no transitions. + test_leap_seconds(std::chrono::sys_seconds{-1s}, 0s); + test_leap_seconds(std::chrono::sys_seconds{0s}, 0s); + test_leap_seconds(std::chrono::sys_seconds{1s}, 0s); + + // Transitions from the start of UTC. 
+ test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1972} - 1ns, 0s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1972}, 0s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1972} + 1ns, 0s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1972} - 1ns, 0s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1972}, 1s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1972} + 1ns, 1s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1973} - 1ns, 1s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1973}, 2s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1973} + 1ns, 2s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1974} - 1ns, 2s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1974}, 3s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1974} + 1ns, 3s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1975} - 1ns, 3s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1975}, 4s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1975} + 1ns, 4s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1976} - 1ns, 4s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1976}, 5s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1976} + 1ns, 5s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1977} - 1ns, 5s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1977}, 6s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1977} + 1ns, 6s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1978} - 1ns, 6s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1978}, 7s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1978} + 1ns, 7s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1979} - 1ns, 7s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1979}, 8s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1979} + 1ns, 8s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1980} - 1ns, 8s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1980}, 9s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1980} + 1ns, 9s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1981} - 1ns, 9s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1981}, 10s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1981} + 1ns, 10s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1982} - 1ns, 10s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1982}, 11s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1982} + 1ns, 11s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1983} - 1ns, 11s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1983}, 12s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1983} + 1ns, 12s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1985} - 1ns, 12s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1985}, 13s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1985} + 
1ns, 13s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1988} - 1ns, 13s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1988}, 14s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1988} + 1ns, 14s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1990} - 1ns, 14s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1990}, 15s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1990} + 1ns, 15s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1991} - 1ns, 15s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1991}, 16s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1991} + 1ns, 16s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1992} - 1ns, 16s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1992}, 17s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1992} + 1ns, 17s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1993} - 1ns, 17s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1993}, 18s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1993} + 1ns, 18s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1994} - 1ns, 18s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1994}, 19s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1994} + 1ns, 19s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1996} - 1ns, 19s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1996}, 20s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1996} + 1ns, 20s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1997} - 1ns, 20s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1997}, 21s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1997} + 1ns, 21s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1999} - 1ns, 21s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1999}, 22s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1999} + 1ns, 22s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2006} - 1ns, 22s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2006}, 23s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2006} + 1ns, 23s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2009} - 1ns, 23s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2009}, 24s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2009} + 1ns, 24s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 2012} - 1ns, 24s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 2012}, 25s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 2012} + 1ns, 25s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 2015} - 1ns, 25s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 2015}, 26s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 2015} + 1ns, 26s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2017} - 1ns, 26s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2017}, 27s); + 
test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2017} + 1ns, 27s); + + // This validates status when the tests were written. + // It's not possible to test the future; there might be additional leap + // seconds in the future. + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2024} - 1ns, 27s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2024}, 27s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2024} + 1ns, 27s); +} + +// Tests whether the return type is the expected type. +static void test_return_type() { + namespace cr = std::chrono; + using namespace std::literals::chrono_literals; + + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{0ns}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{0us}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{0ms}); + } + + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::seconds{0}}); + } + + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::minutes{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::hours{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::days{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::weeks{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::months{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::years{0}}); + } +} + +int main(int, const char**) { + test_example_standard(); + test_transitions(); + test_return_type(); + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/now.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/now.pass.cpp new file mode 100644 index 0000000000000..2b6967b1c983a --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/now.pass.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// +// +// class utc_clock; + +// static time_point now(); + +#include +#include +#include + +int main(int, const char**) { + using clock = std::chrono::utc_clock; + std::same_as decltype(auto) t = clock::now(); + + assert(t >= clock::time_point::min()); + assert(t <= clock::time_point::max()); + + auto t2 = clock::now(); + assert(t2 - t >= std::chrono::seconds(0)); + // This may fail if the tests takes a long time to complete. 
+ assert(t2 - t < std::chrono::seconds(42)); + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp new file mode 100644 index 0000000000000..9b43ca4c0dde0 --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp @@ -0,0 +1,252 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// + +// class utc_clock; + +// static sys_time> +// to_sys(const utc_time<_Duration>& __time); + +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" + +template +static void test_leap_seconds(std::chrono::utc_time time, std::chrono::sys_time expected) { + auto result = std::chrono::utc_clock::to_sys(time); + TEST_REQUIRE( + result == expected, + TEST_WRITE_CONCATENATED("\tTime: ", time, "\nExpected output ", expected, "\nActual output ", result, '\n')); +} + +static std::chrono::sys_seconds get_sys_time(long long seconds_since_1900) { + // The file leap-seconds.list stores dates since 1 January 1900, 00:00:00, we want + // seconds since 1 January 1970. + constexpr auto offset = + std::chrono::sys_days{std::chrono::January / 1 / 1970} - std::chrono::sys_days{std::chrono::January / 1 / 1900}; + return std::chrono::sys_seconds{std::chrono::seconds{seconds_since_1900} - offset}; +} + +// Tests the set of existing database entries at the time of writing. Since +// the last leap second insertion is several years ago, it's expected all +// systems have the same information. (Adding new entries in the future does +// not affect this test.) +static void test_transitions() { + using namespace std::literals::chrono_literals; + + test_leap_seconds(std::chrono::utc_seconds::min(), std::chrono::sys_seconds::min()); + + // Epoch transition no transitions. + test_leap_seconds(std::chrono::utc_seconds{-1s}, std::chrono::sys_seconds{-1s}); + test_leap_seconds(std::chrono::utc_seconds{0s}, std::chrono::sys_seconds{0s}); + test_leap_seconds(std::chrono::utc_seconds{1s}, std::chrono::sys_seconds{1s}); + + // "sys" is the time of the transition to the next leap second. + // "elapsed" is the number of leap seconds before the transition. + // "positive" is the leap second added +1s? If not it's -1s. + auto test_transition = [](std::chrono::sys_seconds sys, std::chrono::seconds elapsed, bool positive) { + // Note at the time of writing all leap seconds are positive so the else + // branch is never executed. The private test for this function tests + // negative leap seconds and uses the else branch. + + std::chrono::utc_seconds utc = std::chrono::utc_seconds{sys.time_since_epoch()} + elapsed; + if (positive) { + // Every transition has the following tests + // - 1ns before the start of the transition no adjustment needed + // - at the start of the transition sys is clamped at the time just prior to the moment + // of the leap second insertion. 
The exact value depends + // on the resolution of the result type. + // - 1ns before the end of the transition sys is still clamped like before + // - at the end of the transition sys is 1s behind the utc time + // - 1ns after the end of the transition sys is still 1s behind the utc time + test_leap_seconds(utc - 1ns, sys - 1ns); + test_leap_seconds(utc, sys - 1s); + test_leap_seconds(utc + 0ns, sys - 1ns); + test_leap_seconds(utc + 1s - 1ns, sys - 1ns); + test_leap_seconds(utc + 1s, sys); + test_leap_seconds(utc + 1s + 0ns, sys + 0ns); + test_leap_seconds(utc + 1s + 1ns, sys + 1ns); + } else { + // Every transition has the following tests + // - 1ns before the transition no adjustment needed + // - at the transition sys is 1s ahead of the utc time + // - 1ns after the transition sys is still 1s ahead of the utc time + test_leap_seconds(utc - 1ns, sys - 1ns); + test_leap_seconds(utc, sys + 1s); + test_leap_seconds(utc + 1ns, sys + 1s + 1ns); + } + }; + + // Transitions from the start of UTC. + test_transition(get_sys_time(2287785600), 0s, true); // 1 Jul 1972 + test_transition(get_sys_time(2303683200), 1s, true); // 1 Jan 1973 + test_transition(get_sys_time(2335219200), 2s, true); // 1 Jan 1974 + test_transition(get_sys_time(2366755200), 3s, true); // 1 Jan 1975 + test_transition(get_sys_time(2398291200), 4s, true); // 1 Jan 1976 + test_transition(get_sys_time(2429913600), 5s, true); // 1 Jan 1977 + test_transition(get_sys_time(2461449600), 6s, true); // 1 Jan 1978 + test_transition(get_sys_time(2492985600), 7s, true); // 1 Jan 1979 + test_transition(get_sys_time(2524521600), 8s, true); // 1 Jan 1980 + test_transition(get_sys_time(2571782400), 9s, true); // 1 Jul 1981 + test_transition(get_sys_time(2603318400), 10s, true); // 1 Jul 1982 + test_transition(get_sys_time(2634854400), 11s, true); // 1 Jul 1983 + test_transition(get_sys_time(2698012800), 12s, true); // 1 Jul 1985 + test_transition(get_sys_time(2776982400), 13s, true); // 1 Jan 1988 + test_transition(get_sys_time(2840140800), 14s, true); // 1 Jan 1990 + test_transition(get_sys_time(2871676800), 15s, true); // 1 Jan 1991 + test_transition(get_sys_time(2918937600), 16s, true); // 1 Jul 1992 + test_transition(get_sys_time(2950473600), 17s, true); // 1 Jul 1993 + test_transition(get_sys_time(2982009600), 18s, true); // 1 Jul 1994 + test_transition(get_sys_time(3029443200), 19s, true); // 1 Jan 1996 + test_transition(get_sys_time(3076704000), 20s, true); // 1 Jul 1997 + test_transition(get_sys_time(3124137600), 21s, true); // 1 Jan 1999 + test_transition(get_sys_time(3345062400), 22s, true); // 1 Jan 2006 + test_transition(get_sys_time(3439756800), 23s, true); // 1 Jan 2009 + test_transition(get_sys_time(3550089600), 24s, true); // 1 Jul 2012 + test_transition(get_sys_time(3644697600), 25s, true); // 1 Jul 2015 + test_transition(get_sys_time(3692217600), 26s, true); // 1 Jan 2017 +} + +// Tests the transition for clocks where the duration's rep is a floating-point type. +static void test_transitions_floating_point() { + using namespace std::literals::chrono_literals; + + // Based on test_transitions but uses a floating-point duration. + using F = float; + + auto test_transition = [](std::chrono::sys_seconds sys, std::chrono::seconds elapsed, bool positive) { + // Note at the time of writing all leap seconds are positive so the else + // branch is never executed. The private test for this function tests + // negative leap seconds and uses the else branch. 
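+ //
+ // Besides mirroring the integral checks, the body below also probes the
+ // representable float values directly adjacent to each boundary with
+ // std::nextafter, to catch off-by-one-ULP errors in the conversion.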
+ + std::chrono::utc_seconds utc = std::chrono::utc_seconds{sys.time_since_epoch()} + elapsed; + + using D = std::chrono::duration; + using S = std::chrono ::time_point; + using U = std::chrono ::time_point; + + S s{sys.time_since_epoch()}; + bool is_leap_second = s.time_since_epoch().count() == sys.time_since_epoch().count(); + assert(is_leap_second); + + U u{utc.time_since_epoch()}; + if (positive) { + test_leap_seconds(u - 1ns, s - 1ns); + test_leap_seconds(u, s - 1s); + test_leap_seconds(u + 0ns, s - 1ns); + test_leap_seconds(u + 1s - 1ns, s - 1ns); + test_leap_seconds(u + 1s, s); + test_leap_seconds(u + 1s + 0ns, s + 0ns); + test_leap_seconds(u + 1s + 1ns, s + 1ns); + + test_leap_seconds(U{D{std::nextafter(u.time_since_epoch().count(), F{0})}}, + S{D{std::nextafter(s.time_since_epoch().count(), F{0})}}); + test_leap_seconds(u, S{D{s.time_since_epoch().count() - F{1}}}); + test_leap_seconds(U{D{u.time_since_epoch().count() + F{1}}}, s); + test_leap_seconds(U{D{std::nextafter(u.time_since_epoch().count() + F{1}, std::numeric_limits::max())}}, + S{D{std::nextafter(s.time_since_epoch().count(), std::numeric_limits::max())}}); + } + }; + + // Transitions from the start of UTC. + test_transition(get_sys_time(2287785600), 0s, true); // 1 Jul 1972 + test_transition(get_sys_time(2303683200), 1s, true); // 1 Jan 1973 + test_transition(get_sys_time(2335219200), 2s, true); // 1 Jan 1974 + test_transition(get_sys_time(2366755200), 3s, true); // 1 Jan 1975 + test_transition(get_sys_time(2398291200), 4s, true); // 1 Jan 1976 + test_transition(get_sys_time(2429913600), 5s, true); // 1 Jan 1977 + test_transition(get_sys_time(2461449600), 6s, true); // 1 Jan 1978 + test_transition(get_sys_time(2492985600), 7s, true); // 1 Jan 1979 + test_transition(get_sys_time(2524521600), 8s, true); // 1 Jan 1980 + test_transition(get_sys_time(2571782400), 9s, true); // 1 Jul 1981 + test_transition(get_sys_time(2603318400), 10s, true); // 1 Jul 1982 + test_transition(get_sys_time(2634854400), 11s, true); // 1 Jul 1983 + test_transition(get_sys_time(2698012800), 12s, true); // 1 Jul 1985 + test_transition(get_sys_time(2776982400), 13s, true); // 1 Jan 1988 + test_transition(get_sys_time(2840140800), 14s, true); // 1 Jan 1990 + test_transition(get_sys_time(2871676800), 15s, true); // 1 Jan 1991 + test_transition(get_sys_time(2918937600), 16s, true); // 1 Jul 1992 + test_transition(get_sys_time(2950473600), 17s, true); // 1 Jul 1993 + test_transition(get_sys_time(2982009600), 18s, true); // 1 Jul 1994 + test_transition(get_sys_time(3029443200), 19s, true); // 1 Jan 1996 + test_transition(get_sys_time(3076704000), 20s, true); // 1 Jul 1997 + test_transition(get_sys_time(3124137600), 21s, true); // 1 Jan 1999 + test_transition(get_sys_time(3345062400), 22s, true); // 1 Jan 2006 + test_transition(get_sys_time(3439756800), 23s, true); // 1 Jan 2009 + test_transition(get_sys_time(3550089600), 24s, true); // 1 Jul 2012 + test_transition(get_sys_time(3644697600), 25s, true); // 1 Jul 2015 + test_transition(get_sys_time(3692217600), 26s, true); // 1 Jan 2017 +} + +// Tests whether the return type is the expected type. 
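+// (to_sys is specified to return sys_time<common_type_t<Duration, seconds>>,
+// so inputs coarser than seconds, such as minutes, hours, or years, are
+// expected to come back as a seconds-based sys_time.)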
+static void test_return_type() { + namespace cr = std::chrono; + using namespace std::literals::chrono_literals; + + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{0ns}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{0us}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{0ms}); + } + + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::seconds{0}}); + } + + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::minutes{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::hours{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::days{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::weeks{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::months{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::years{0}}); + } +} + +int main(int, const char**) { + test_transitions(); + test_transitions_floating_point(); + test_return_type(); + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/types.compile.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/types.compile.pass.cpp new file mode 100644 index 0000000000000..0322e9122e1cd --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/types.compile.pass.cpp @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// + +// class utc_clock { +// public: +// using rep = a signed arithmetic type; +// using period = ratio; +// using duration = chrono::duration; +// using time_point = chrono::time_point; +// static constexpr bool is_steady = unspecified; +// +// ... +// }; +// +// template +// using utc_time = time_point; +// using utc_seconds = utc_time; + +#include +#include +#include + +#include "test_macros.h" + +// class utc_clock +using rep = std::chrono::utc_clock::rep; +using period = std::chrono::utc_clock::period; +using duration = std::chrono::utc_clock::duration; +using time_point = std::chrono::utc_clock::time_point; +constexpr bool is_steady = std::chrono::utc_clock::is_steady; + +// Tests the values. Some of them are implementation-defined. 
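+// (The LIBCPP_STATIC_ASSERT lines below pin down libc++'s own
+// implementation-defined choices; the plain static_asserts check what the
+// standard itself mandates.)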
+LIBCPP_STATIC_ASSERT(std::same_as); +static_assert(std::is_arithmetic_v); +static_assert(std::is_signed_v); + +LIBCPP_STATIC_ASSERT(std::same_as); +static_assert(std::same_as>); + +static_assert(std::same_as>); +static_assert(std::same_as>); +LIBCPP_STATIC_ASSERT(is_steady == false); + +// typedefs +static_assert(std::same_as, std::chrono::time_point>); +static_assert(std::same_as, std::chrono::time_point>); +static_assert(std::same_as>); diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/utc_time.ostream.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/utc_time.ostream.pass.cpp new file mode 100644 index 0000000000000..8fd3b8a3e1d47 --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/utc_time.ostream.pass.cpp @@ -0,0 +1,165 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb +// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME + +// TODO FMT This test should not require std::to_chars(floating-point) +// XFAIL: availability-fp_to_chars-missing + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// REQUIRES: locale.fr_FR.UTF-8 +// REQUIRES: locale.ja_JP.UTF-8 + +// + +// using utc_time = ...; + +// template +// basic_ostream& +// operator<<(basic_ostream& os, const utc_time& tp); + +#include +#include +#include +#include + +#include "make_string.h" +#include "platform_support.h" // locale name macros +#include "test_macros.h" + +#define SV(S) MAKE_STRING_VIEW(CharT, S) + +template +static std::basic_string stream_c_locale(std::chrono::utc_time time_point) { + std::basic_stringstream sstr; + sstr << std::fixed << time_point; + return sstr.str(); +} + +template +static std::basic_string stream_fr_FR_locale(std::chrono::utc_time time_point) { + std::basic_stringstream sstr; + const std::locale locale(LOCALE_fr_FR_UTF_8); + sstr.imbue(locale); + sstr << std::fixed << time_point; + return sstr.str(); +} + +template +static std::basic_string stream_ja_JP_locale(std::chrono::utc_time time_point) { + std::basic_stringstream sstr; + const std::locale locale(LOCALE_ja_JP_UTF_8); + sstr.imbue(locale); + sstr << std::fixed << time_point; + return sstr.str(); +} + +template +static void test_c() { + using namespace std::literals::chrono_literals; + + assert(stream_c_locale(std::chrono::utc_time{946'688'523'123'456'789ns}) == + SV("2000-01-01 01:01:41.123456789")); + assert(stream_c_locale(std::chrono::utc_time{946'688'523'123'456us}) == + SV("2000-01-01 01:01:41.123456")); + + assert(stream_c_locale(std::chrono::utc_time{946'684'822'123ms}) == + SV("2000-01-01 00:00:00.123")); + assert(stream_c_locale(std::chrono::utc_seconds{1'234'567'890s}) == SV("2009-02-13 23:31:06")); + assert(stream_c_locale(std::chrono::utc_time{20'576'131min}) == + SV("2009-02-13 23:30:36")); + assert(stream_c_locale(std::chrono::utc_time{342'935h}) == SV("2009-02-13 22:59:36")); + + assert(stream_c_locale(std::chrono::utc_time>>{ + std::chrono::duration>{60}}) == SV("1970-01-01 00:02:00")); + assert(stream_c_locale(std::chrono::utc_time>>{ + std::chrono::duration>{3600}}) == SV("1970-01-01 00:30:00.0")); + 
assert(stream_c_locale(std::chrono::utc_time>>{ + std::chrono::duration>{3600}}) == SV("1970-01-01 00:15:00.00")); + assert(stream_c_locale(std::chrono::utc_time>>{ + std::chrono::duration>{36611}}) == SV("1970-01-01 01:01:01.1")); + assert(stream_c_locale(std::chrono::utc_time>>{ + std::chrono::duration>{12'345'678'9010}}) == SV("2009-02-13 23:31:06.10")); +} + +template +static void test_fr_FR() { + using namespace std::literals::chrono_literals; + + assert(stream_fr_FR_locale(std::chrono::utc_time{946'688'523'123'456'789ns}) == + SV("2000-01-01 01:01:41,123456789")); + assert(stream_fr_FR_locale(std::chrono::utc_time{946'688'523'123'456us}) == + SV("2000-01-01 01:01:41,123456")); + + assert(stream_fr_FR_locale(std::chrono::utc_time{946'684'822'123ms}) == + SV("2000-01-01 00:00:00,123")); + assert(stream_fr_FR_locale(std::chrono::utc_seconds{1'234'567'890s}) == SV("2009-02-13 23:31:06")); + assert(stream_fr_FR_locale(std::chrono::utc_time{20'576'131min}) == + SV("2009-02-13 23:30:36")); + assert(stream_fr_FR_locale(std::chrono::utc_time{342'935h}) == SV("2009-02-13 22:59:36")); + + assert(stream_fr_FR_locale(std::chrono::utc_time>>{ + std::chrono::duration>{60}}) == SV("1970-01-01 00:02:00")); + assert(stream_fr_FR_locale(std::chrono::utc_time>>{ + std::chrono::duration>{3600}}) == SV("1970-01-01 00:30:00,0")); + assert(stream_fr_FR_locale(std::chrono::utc_time>>{ + std::chrono::duration>{3600}}) == SV("1970-01-01 00:15:00,00")); + assert(stream_fr_FR_locale(std::chrono::utc_time>>{ + std::chrono::duration>{36611}}) == SV("1970-01-01 01:01:01,1")); + assert(stream_fr_FR_locale(std::chrono::utc_time>>{ + std::chrono::duration>{12'345'678'9010}}) == SV("2009-02-13 23:31:06,10")); +} + +template +static void test_ja_JP() { + using namespace std::literals::chrono_literals; + + assert(stream_ja_JP_locale(std::chrono::utc_time{946'688'523'123'456'789ns}) == + SV("2000-01-01 01:01:41.123456789")); + assert(stream_ja_JP_locale(std::chrono::utc_time{946'688'523'123'456us}) == + SV("2000-01-01 01:01:41.123456")); + + assert(stream_ja_JP_locale(std::chrono::utc_time{946'684'822'123ms}) == + SV("2000-01-01 00:00:00.123")); + assert(stream_ja_JP_locale(std::chrono::utc_seconds{1'234'567'890s}) == SV("2009-02-13 23:31:06")); + assert(stream_ja_JP_locale(std::chrono::utc_time{20'576'131min}) == + SV("2009-02-13 23:30:36")); + assert(stream_ja_JP_locale(std::chrono::utc_time{342'935h}) == SV("2009-02-13 22:59:36")); + + assert(stream_ja_JP_locale(std::chrono::utc_time>>{ + std::chrono::duration>{60}}) == SV("1970-01-01 00:02:00")); + assert(stream_ja_JP_locale(std::chrono::utc_time>>{ + std::chrono::duration>{3600}}) == SV("1970-01-01 00:30:00.0")); + assert(stream_ja_JP_locale(std::chrono::utc_time>>{ + std::chrono::duration>{3600}}) == SV("1970-01-01 00:15:00.00")); + assert(stream_ja_JP_locale(std::chrono::utc_time>>{ + std::chrono::duration>{36611}}) == SV("1970-01-01 01:01:01.1")); + assert(stream_ja_JP_locale(std::chrono::utc_time>>{ + std::chrono::duration>{12'345'678'9010}}) == SV("2009-02-13 23:31:06.10")); +} + +template +static void test() { + test_c(); + test_fr_FR(); + test_ja_JP(); +} + +int main(int, char**) { + test(); + +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test(); +#endif + + return 0; +} diff --git a/libcxx/test/std/time/time.syn/formatter.utc_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.utc_time.pass.cpp new file mode 100644 index 0000000000000..e6f94bf7fecc6 --- /dev/null +++ b/libcxx/test/std/time/time.syn/formatter.utc_time.pass.cpp @@ -0,0 +1,1004 @@ 
+//===----------------------------------------------------------------------===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb +// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME + +// TODO FMT This test should not require std::to_chars(floating-point) +// XFAIL: availability-fp_to_chars-missing + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// REQUIRES: locale.fr_FR.UTF-8 +// REQUIRES: locale.ja_JP.UTF-8 + +// + +// template +// struct formatter, charT>; + +#include +#include + +#include +#include +#include +#include +#include + +#include "formatter_tests.h" +#include "make_string.h" +#include "platform_support.h" // locale name macros +#include "test_macros.h" + +template +static void test_no_chrono_specs() { + using namespace std::literals::chrono_literals; + + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output + + // [time.syn] + // using nanoseconds = duration; + // using microseconds = duration; + // using milliseconds = duration; + // using seconds = duration; + // using minutes = duration>; + // using hours = duration>; + check(SV("1425-08-04 22:06:56"), SV("{}"), std::chrono::utc_seconds(-17'179'869'184s)); // Minimum value for 35 bits. + check(SV("1901-12-13 20:45:52"), SV("{}"), std::chrono::utc_seconds(-2'147'483'648s)); + + check(SV("1969-12-31 00:00:00"), SV("{}"), std::chrono::utc_seconds(-24h)); + check(SV("1969-12-31 06:00:00"), SV("{}"), std::chrono::utc_seconds(-18h)); + check(SV("1969-12-31 12:00:00"), SV("{}"), std::chrono::utc_seconds(-12h)); + check(SV("1969-12-31 18:00:00"), SV("{}"), std::chrono::utc_seconds(-6h)); + check(SV("1969-12-31 23:59:59"), SV("{}"), std::chrono::utc_seconds(-1s)); + + check(SV("1970-01-01 00:00:00"), SV("{}"), std::chrono::utc_seconds(0s)); + check(SV("2000-01-01 00:00:00"), SV("{}"), std::chrono::utc_seconds(946'684'800s + 22s)); + check(SV("2000-01-01 01:02:03"), SV("{}"), std::chrono::utc_seconds(946'688'523s + 22s)); + + check(SV("2038-01-19 03:14:07"), SV("{}"), std::chrono::utc_seconds(2'147'483'647s + 27s)); + check(SV("2514-05-30 01:53:03"), + SV("{}"), + std::chrono::utc_seconds(17'179'869'183s + 27s)); // Maximum value for 35 bits. 
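+ // The '+ 22s' and '+ 27s' terms above add the leap seconds that had
+ // accumulated by the given date (22 by 2000, 27 from 2017 onwards), so a
+ // round sys-based count still formats as a round calendar time.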
+ + check(SV("2000-01-01 01:02:03.123"), + SV("{}"), + std::chrono::utc_time(946'688'523'123ms + 22s)); + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_year() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = + SV("{:%%C='%C'%t%%EC='%EC'%t%%y='%y'%t%%Oy='%Oy'%t%%Ey='%Ey'%t%%Y='%Y'%t%%EY='%EY'%n}"); + constexpr std::basic_string_view lfmt = + SV("{:L%%C='%C'%t%%EC='%EC'%t%%y='%y'%t%%Oy='%Oy'%t%%Ey='%Ey'%t%%Y='%Y'%t%%EY='%EY'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%C='19'\t%EC='19'\t%y='70'\t%Oy='70'\t%Ey='70'\t%Y='1970'\t%EY='1970'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%C='20'\t%EC='20'\t%y='09'\t%Oy='09'\t%Ey='09'\t%Y='2009'\t%EY='2009'\n"), + fmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use the global locale (fr_FR) + check(SV("%C='19'\t%EC='19'\t%y='70'\t%Oy='70'\t%Ey='70'\t%Y='1970'\t%EY='1970'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%C='20'\t%EC='20'\t%y='09'\t%Oy='09'\t%Ey='09'\t%Y='2009'\t%EY='2009'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use supplied locale (ja_JP). This locale has a different alternate. +#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%C='19'\t%EC='19'\t%y='70'\t%Oy='70'\t%Ey='70'\t%Y='1970'\t%EY='1970'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%C='20'\t%EC='20'\t%y='09'\t%Oy='09'\t%Ey='09'\t%Y='2009'\t%EY='2009'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX)||defined(__FreeBSD__) + check(loc, + SV("%C='19'\t%EC='昭和'\t%y='70'\t%Oy='七十'\t%Ey='45'\t%Y='1970'\t%EY='昭和45年'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%C='20'\t%EC='平成'\t%y='09'\t%Oy='九'\t%Ey='21'\t%Y='2009'\t%EY='平成21年'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX)||defined(__FreeBSD__) + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_month() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%b='%b'%t%%h='%h'%t%%B='%B'%t%%m='%m'%t%%Om='%Om'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%b='%b'%t%%h='%h'%t%%B='%B'%t%%m='%m'%t%%Om='%Om'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%b='Jan'\t%h='Jan'\t%B='January'\t%m='01'\t%Om='01'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%b='May'\t%h='May'\t%B='May'\t%m='05'\t%Om='05'\n"), + fmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + // Use the global locale (fr_FR) +#if defined(__APPLE__) + check(SV("%b='jan'\t%h='jan'\t%B='janvier'\t%m='01'\t%Om='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 +#else + 
check(SV("%b='janv.'\t%h='janv.'\t%B='janvier'\t%m='01'\t%Om='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 +#endif + + check(SV("%b='mai'\t%h='mai'\t%B='mai'\t%m='05'\t%Om='05'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + // Use supplied locale (ja_JP). This locale has a different alternate. +#ifdef _WIN32 + check(loc, + SV("%b='1'\t%h='1'\t%B='1月'\t%m='01'\t%Om='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%b='5'\t%h='5'\t%B='5月'\t%m='05'\t%Om='05'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#elif defined(_AIX) // _WIN32 + check(loc, + SV("%b='1月'\t%h='1月'\t%B='1月'\t%m='01'\t%Om='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%b='5月'\t%h='5月'\t%B='5月'\t%m='05'\t%Om='05'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#elif defined(__APPLE__) // _WIN32 + check(loc, + SV("%b=' 1'\t%h=' 1'\t%B='1月'\t%m='01'\t%Om='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%b=' 5'\t%h=' 5'\t%B='5月'\t%m='05'\t%Om='05'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#elif defined(__FreeBSD__) // _WIN32 + check(loc, + SV("%b=' 1月'\t%h=' 1月'\t%B='1月'\t%m='01'\t%Om='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%b=' 5月'\t%h=' 5月'\t%B='5月'\t%m='05'\t%Om='05'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#else // _WIN32 + check(loc, + SV("%b=' 1月'\t%h=' 1月'\t%B='1月'\t%m='01'\t%Om='一'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%b=' 5月'\t%h=' 5月'\t%B='5月'\t%m='05'\t%Om='五'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#endif // _WIN32 + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_day() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%d='%d'%t%%Od='%Od'%t%%e='%e'%t%%Oe='%Oe'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%d='%d'%t%%Od='%Od'%t%%e='%e'%t%%Oe='%Oe'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%d='01'\t%Od='01'\t%e=' 1'\t%Oe=' 1'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%d='13'\t%Od='13'\t%e='13'\t%Oe='13'\n"), + fmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use the global locale (fr_FR) + check(SV("%d='01'\t%Od='01'\t%e=' 1'\t%Oe=' 1'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%d='13'\t%Od='13'\t%e='13'\t%Oe='13'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use supplied locale (ja_JP). This locale has a different alternate. 
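+ // (Only the glibc branch below has true alternate numerals: %Od and %Oe
+ // render 一 and 十三; the other platforms repeat the Arabic digits.)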
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%d='01'\t%Od='01'\t%e=' 1'\t%Oe=' 1'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%d='13'\t%Od='13'\t%e='13'\t%Oe='13'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%d='01'\t%Od='一'\t%e=' 1'\t%Oe='一'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%d='13'\t%Od='十三'\t%e='13'\t%Oe='十三'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + +#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_weekday() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = + SV("{:%%a='%a'%t%%A='%A'%t%%u='%u'%t%%Ou='%Ou'%t%%w='%w'%t%%Ow='%Ow'%n}"); + constexpr std::basic_string_view lfmt = + SV("{:L%%a='%a'%t%%A='%A'%t%%u='%u'%t%%Ou='%Ou'%t%%w='%w'%t%%Ow='%Ow'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%a='Thu'\t%A='Thursday'\t%u='4'\t%Ou='4'\t%w='4'\t%Ow='4'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%a='Sun'\t%A='Sunday'\t%u='7'\t%Ou='7'\t%w='0'\t%Ow='0'\n"), + fmt, + std::chrono::utc_seconds(4'294'967'295s)); // 06:28:15 UTC on Sunday, 7 February 2106 + + // Use the global locale (fr_FR) +#if defined(__APPLE__) + check(SV("%a='Jeu'\t%A='Jeudi'\t%u='4'\t%Ou='4'\t%w='4'\t%Ow='4'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%a='Dim'\t%A='Dimanche'\t%u='7'\t%Ou='7'\t%w='0'\t%Ow='0'\n"), + lfmt, + std::chrono::utc_seconds(4'294'967'295s)); // 06:28:15 UTC on Sunday, 7 February 2106 +#else + check(SV("%a='jeu.'\t%A='jeudi'\t%u='4'\t%Ou='4'\t%w='4'\t%Ow='4'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%a='dim.'\t%A='dimanche'\t%u='7'\t%Ou='7'\t%w='0'\t%Ow='0'\n"), + lfmt, + std::chrono::utc_seconds(4'294'967'295s)); // 06:28:15 UTC on Sunday, 7 February 2106 +#endif + + // Use supplied locale (ja_JP). 
+ // This locale has a different alternate, but not on all platforms +#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%a='木'\t%A='木曜日'\t%u='4'\t%Ou='4'\t%w='4'\t%Ow='4'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%a='日'\t%A='日曜日'\t%u='7'\t%Ou='7'\t%w='0'\t%Ow='0'\n"), + lfmt, + std::chrono::utc_seconds(4'294'967'295s)); // 06:28:15 UTC on Sunday, 7 February 2106 +#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%a='木'\t%A='木曜日'\t%u='4'\t%Ou='四'\t%w='4'\t%Ow='四'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%a='日'\t%A='日曜日'\t%u='7'\t%Ou='七'\t%w='0'\t%Ow='〇'\n"), + lfmt, + std::chrono::utc_seconds(4'294'967'295s)); // 06:28:15 UTC on Sunday, 7 February 2106 +#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_day_of_year() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%j='%j'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%j='%j'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%j='001'\n"), fmt, std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + check(SV("%j='138'\n"), fmt, std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + // Use the global locale (fr_FR) + check(SV("%j='001'\n"), lfmt, std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + check(SV("%j='138'\n"), lfmt, std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + // Use supplied locale (ja_JP). This locale has a different alternate. + check(loc, SV("%j='001'\n"), lfmt, std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check( + loc, SV("%j='138'\n"), lfmt, std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_week() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%U='%U'%t%%OU='%OU'%t%%W='%W'%t%%OW='%OW'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%U='%U'%t%%OU='%OU'%t%%W='%W'%t%%OW='%OW'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%U='00'\t%OU='00'\t%W='00'\t%OW='00'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%U='20'\t%OU='20'\t%W='20'\t%OW='20'\n"), + fmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + // Use the global locale (fr_FR) + check(SV("%U='00'\t%OU='00'\t%W='00'\t%OW='00'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%U='20'\t%OU='20'\t%W='20'\t%OW='20'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + // Use supplied locale (ja_JP). This locale has a different alternate. 
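+ // (%U numbers weeks starting from the first Sunday of the year and %W from
+ // the first Monday; 1 January 1970 was a Thursday, so both start at '00'.
+ // Again only glibc supplies the alternate numerals 〇 and 二十.)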
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%U='00'\t%OU='00'\t%W='00'\t%OW='00'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%U='20'\t%OU='20'\t%W='20'\t%OW='20'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%U='00'\t%OU='〇'\t%W='00'\t%OW='〇'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%U='20'\t%OU='二十'\t%W='20'\t%OW='二十'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_iso_8601_week() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%g='%g'%t%%G='%G'%t%%V='%V'%t%%OV='%OV'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%g='%g'%t%%G='%G'%t%%V='%V'%t%%OV='%OV'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%g='70'\t%G='1970'\t%V='01'\t%OV='01'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%g='09'\t%G='2009'\t%V='07'\t%OV='07'\n"), + fmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use the global locale (fr_FR) + check(SV("%g='70'\t%G='1970'\t%V='01'\t%OV='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%g='09'\t%G='2009'\t%V='07'\t%OV='07'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use supplied locale (ja_JP). This locale has a different alternate. 
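+ // (%V is the ISO 8601 week number, counted from the week containing the
+ // year's first Thursday, and %G/%g give the matching week-based year; since
+ // 1 January 1970 was itself a Thursday, it lands in week 01 of 1970.)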
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%g='70'\t%G='1970'\t%V='01'\t%OV='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%g='09'\t%G='2009'\t%V='07'\t%OV='07'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%g='70'\t%G='1970'\t%V='01'\t%OV='一'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%g='09'\t%G='2009'\t%V='07'\t%OV='七'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_date() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%D='%D'%t%%F='%F'%t%%x='%x'%t%%Ex='%Ex'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%D='%D'%t%%F='%F'%t%%x='%x'%t%%Ex='%Ex'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%D='01/01/70'\t%F='1970-01-01'\t%x='01/01/70'\t%Ex='01/01/70'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%D='02/13/09'\t%F='2009-02-13'\t%x='02/13/09'\t%Ex='02/13/09'\n"), + fmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use the global locale (fr_FR) +#if defined(__APPLE__) || defined(__FreeBSD__) + check(SV("%D='01/01/70'\t%F='1970-01-01'\t%x='01.01.1970'\t%Ex='01.01.1970'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%D='02/13/09'\t%F='2009-02-13'\t%x='13.02.2009'\t%Ex='13.02.2009'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#else + check(SV("%D='01/01/70'\t%F='1970-01-01'\t%x='01/01/1970'\t%Ex='01/01/1970'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%D='02/13/09'\t%F='2009-02-13'\t%x='13/02/2009'\t%Ex='13/02/2009'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#endif + + // Use supplied locale (ja_JP). This locale has a different alternate. 
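+ // (On glibc %Ex uses the Japanese era calendar: 昭和45年 is Showa 45,
+ // i.e. 1970, and 平成21年 is Heisei 21, i.e. 2009.)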
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%D='01/01/70'\t%F='1970-01-01'\t%x='1970/01/01'\t%Ex='1970/01/01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%D='02/13/09'\t%F='2009-02-13'\t%x='2009/02/13'\t%Ex='2009/02/13'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%D='01/01/70'\t%F='1970-01-01'\t%x='1970年01月01日'\t%Ex='昭和45年01月01日'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%D='02/13/09'\t%F='2009-02-13'\t%x='2009年02月13日'\t%Ex='平成21年02月13日'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_time() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV( + "{:" + "%%H='%H'%t" + "%%OH='%OH'%t" + "%%I='%I'%t" + "%%OI='%OI'%t" + "%%M='%M'%t" + "%%OM='%OM'%t" + "%%S='%S'%t" + "%%OS='%OS'%t" + "%%p='%p'%t" + "%%R='%R'%t" + "%%T='%T'%t" + "%%r='%r'%t" + "%%X='%X'%t" + "%%EX='%EX'%t" + "%n}"); + constexpr std::basic_string_view lfmt = SV( + "{:L" + "%%H='%H'%t" + "%%OH='%OH'%t" + "%%I='%I'%t" + "%%OI='%OI'%t" + "%%M='%M'%t" + "%%OM='%OM'%t" + "%%S='%S'%t" + "%%OS='%OS'%t" + "%%p='%p'%t" + "%%R='%R'%t" + "%%T='%T'%t" + "%%r='%r'%t" + "%%X='%X'%t" + "%%EX='%EX'%t" + "%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%H='00'\t" + "%OH='00'\t" + "%I='12'\t" + "%OI='12'\t" + "%M='00'\t" + "%OM='00'\t" + "%S='00'\t" + "%OS='00'\t" + "%p='AM'\t" + "%R='00:00'\t" + "%T='00:00:00'\t" + "%r='12:00:00 AM'\t" + "%X='00:00:00'\t" + "%EX='00:00:00'\t" + "\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%H='23'\t" + "%OH='23'\t" + "%I='11'\t" + "%OI='11'\t" + "%M='31'\t" + "%OM='31'\t" + "%S='30.123'\t" + "%OS='30.123'\t" + "%p='PM'\t" + "%R='23:31'\t" + "%T='23:31:30.123'\t" + "%r='11:31:30 PM'\t" + "%X='23:31:30'\t" + "%EX='23:31:30'\t" + "\n"), + fmt, + std::chrono::utc_time( + 1'234'567'890'123ms + 24s)); // 23:31:30 UTC on Friday, 13 February 2009 + // Use the global locale (fr_FR) + check(SV("%H='00'\t" + "%OH='00'\t" + "%I='12'\t" + "%OI='12'\t" + "%M='00'\t" + "%OM='00'\t" + "%S='00'\t" + "%OS='00'\t" +#if defined(_AIX) + "%p='AM'\t" +#else + "%p=''\t" +#endif + "%R='00:00'\t" + "%T='00:00:00'\t" +#ifdef _WIN32 + "%r='00:00:00'\t" +#elif defined(_AIX) + "%r='12:00:00 AM'\t" +#elif defined(__APPLE__) || defined(__FreeBSD__) + "%r=''\t" +#else + "%r='12:00:00 '\t" +#endif + "%X='00:00:00'\t" + "%EX='00:00:00'\t" + "\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%H='23'\t" + "%OH='23'\t" + "%I='11'\t" + "%OI='11'\t" + "%M='31'\t" + "%OM='31'\t" + "%S='30,123'\t" + "%OS='30,123'\t" +#if defined(_AIX) + "%p='PM'\t" +#else + "%p=''\t" +#endif + "%R='23:31'\t" + "%T='23:31:30,123'\t" +#ifdef _WIN32 + "%r='23:31:30'\t" +#elif defined(_AIX) + "%r='11:31:30 PM'\t" +#elif defined(__APPLE__) || defined(__FreeBSD__) + "%r=''\t" +#else + "%r='11:31:30 '\t" +#endif + 
"%X='23:31:30'\t" + "%EX='23:31:30'\t" + "\n"), + lfmt, + std::chrono::utc_time( + 1'234'567'890'123ms + 24s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use supplied locale (ja_JP). This locale has a different alternate. +#if defined(__APPLE__) || defined(_AIX) || defined(_WIN32) || defined(__FreeBSD__) + check(loc, + SV("%H='00'\t" + "%OH='00'\t" + "%I='12'\t" + "%OI='12'\t" + "%M='00'\t" + "%OM='00'\t" + "%S='00'\t" + "%OS='00'\t" +# if defined(__APPLE__) + "%p='AM'\t" +# else + "%p='午前'\t" +# endif + "%R='00:00'\t" + "%T='00:00:00'\t" +# if defined(__APPLE__) || defined(__FreeBSD__) +# if defined(__APPLE__) + "%r='12:00:00 AM'\t" +# else + "%r='12:00:00 午前'\t" +# endif + "%X='00時00分00秒'\t" + "%EX='00時00分00秒'\t" +# elif defined(_WIN32) + "%r='0:00:00'\t" + "%X='0:00:00'\t" + "%EX='0:00:00'\t" +# else + "%r='午前12:00:00'\t" + "%X='00:00:00'\t" + "%EX='00:00:00'\t" +# endif + "\n"), + lfmt, + std::chrono::hh_mm_ss(0s)); + + check(loc, + SV("%H='23'\t" + "%OH='23'\t" + "%I='11'\t" + "%OI='11'\t" + "%M='31'\t" + "%OM='31'\t" + "%S='30.123'\t" + "%OS='30.123'\t" +# if defined(__APPLE__) + "%p='PM'\t" +# else + "%p='午後'\t" +# endif + "%R='23:31'\t" + "%T='23:31:30.123'\t" +# if defined(__APPLE__) || defined(__FreeBSD__) +# if defined(__APPLE__) + "%r='11:31:30 PM'\t" +# else + "%r='11:31:30 午後'\t" +# endif + "%X='23時31分30秒'\t" + "%EX='23時31分30秒'\t" +# elif defined(_WIN32) + "%r='23:31:30'\t" + "%X='23:31:30'\t" + "%EX='23:31:30'\t" +# else + "%r='午後11:31:30'\t" + "%X='23:31:30'\t" + "%EX='23:31:30'\t" +# endif + "\n"), + lfmt, + std::chrono::hh_mm_ss(23h + 31min + 30s + 123ms)); +#else // defined(__APPLE__) || defined(_AIX) || defined(_WIN32) || defined(__FreeBSD__) + check(loc, + SV("%H='00'\t" + "%OH='〇'\t" + "%I='12'\t" + "%OI='十二'\t" + "%M='00'\t" + "%OM='〇'\t" + "%S='00'\t" + "%OS='〇'\t" + "%p='午前'\t" + "%R='00:00'\t" + "%T='00:00:00'\t" + "%r='午前12時00分00秒'\t" + "%X='00時00分00秒'\t" + "%EX='00時00分00秒'\t" + "\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%H='23'\t" + "%OH='二十三'\t" + "%I='11'\t" + "%OI='十一'\t" + "%M='31'\t" + "%OM='三十一'\t" + "%S='30.123'\t" + "%OS='三十.123'\t" + "%p='午後'\t" + "%R='23:31'\t" + "%T='23:31:30.123'\t" + "%r='午後11時31分30秒'\t" + "%X='23時31分30秒'\t" + "%EX='23時31分30秒'\t" + "\n"), + lfmt, + std::chrono::utc_time( + 1'234'567'890'123ms + 24s)); // 23:31:30 UTC on Friday, 13 February 2009 +#endif // defined(__APPLE__) || defined(_AIX) || defined(_WIN32) || defined(__FreeBSD__) + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_date_time() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%c='%c'%t%%Ec='%Ec'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%c='%c'%t%%Ec='%Ec'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%c='Thu Jan 1 00:00:00 1970'\t%Ec='Thu Jan 1 00:00:00 1970'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%c='Fri Feb 13 23:31:30 2009'\t%Ec='Fri Feb 13 23:31:30 2009'\n"), + fmt, + std::chrono::utc_seconds(1'234'567'890s + 24s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use the global locale (fr_FR) + check( +// https://sourceware.org/bugzilla/show_bug.cgi?id=24054 +#if defined(__powerpc__) && defined(__linux__) + SV("%c='jeu. 01 janv. 1970 00:00:00 UTC'\t%Ec='jeu. 01 janv. 
1970 00:00:00 UTC'\n"),
+#elif defined(__GLIBC__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 29
+ SV("%c='jeu. 01 janv. 1970 00:00:00 GMT'\t%Ec='jeu. 01 janv. 1970 00:00:00 GMT'\n"),
+#elif defined(_AIX)
+ SV("%c=' 1 janvier 1970 à 00:00:00 UTC'\t%Ec=' 1 janvier 1970 à 00:00:00 UTC'\n"),
+#elif defined(__APPLE__)
+ SV("%c='Jeu 1 jan 00:00:00 1970'\t%Ec='Jeu 1 jan 00:00:00 1970'\n"),
+#elif defined(_WIN32)
+ SV("%c='01/01/1970 00:00:00'\t%Ec='01/01/1970 00:00:00'\n"),
+#elif defined(__FreeBSD__)
+ SV("%c='jeu. 1 janv. 00:00:00 1970'\t%Ec='jeu. 1 janv. 00:00:00 1970'\n"),
+#else
+ SV("%c='jeu. 01 janv. 1970 00:00:00'\t%Ec='jeu. 01 janv. 1970 00:00:00'\n"),
+#endif
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+
+ check(
+// https://sourceware.org/bugzilla/show_bug.cgi?id=24054
+#if defined(__powerpc__) && defined(__linux__)
+ SV("%c='ven. 13 févr. 2009 23:31:30 UTC'\t%Ec='ven. 13 févr. 2009 23:31:30 UTC'\n"),
+#elif defined(__GLIBC__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 29
+ SV("%c='ven. 13 févr. 2009 23:31:30 GMT'\t%Ec='ven. 13 févr. 2009 23:31:30 GMT'\n"),
+#elif defined(_AIX)
+ SV("%c='13 février 2009 à 23:31:30 UTC'\t%Ec='13 février 2009 à 23:31:30 UTC'\n"),
+#elif defined(__APPLE__)
+ SV("%c='Ven 13 fév 23:31:30 2009'\t%Ec='Ven 13 fév 23:31:30 2009'\n"),
+#elif defined(_WIN32)
+ SV("%c='13/02/2009 23:31:30'\t%Ec='13/02/2009 23:31:30'\n"),
+#elif defined(__FreeBSD__)
+ SV("%c='ven. 13 févr. 23:31:30 2009'\t%Ec='ven. 13 févr. 23:31:30 2009'\n"),
+#else
+ SV("%c='ven. 13 févr. 2009 23:31:30'\t%Ec='ven. 13 févr. 2009 23:31:30'\n"),
+#endif
+ lfmt,
+ std::chrono::utc_seconds(1'234'567'890s + 24s)); // 23:31:30 UTC on Friday, 13 February 2009
+
+ // Use supplied locale (ja_JP). This locale has a different alternate.
+#if defined(__APPLE__) || defined(__FreeBSD__)
+ check(loc,
+ SV("%c='木 1/ 1 00:00:00 1970'\t%Ec='木 1/ 1 00:00:00 1970'\n"),
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+ check(loc,
+ SV("%c='金 2/13 23:31:30 2009'\t%Ec='金 2/13 23:31:30 2009'\n"),
+ lfmt,
+ std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009
+#elif defined(_AIX) // defined(__APPLE__)|| defined(__FreeBSD__)
+ check(loc,
+ SV("%c='1970年01月 1日 00:00:00 UTC'\t%Ec='1970年01月 1日 00:00:00 UTC'\n"),
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+ check(loc,
+ SV("%c='2009年02月13日 23:31:30 UTC'\t%Ec='2009年02月13日 23:31:30 UTC'\n"),
+ lfmt,
+ std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009
+#elif defined(_WIN32) // defined(__APPLE__)|| defined(__FreeBSD__)
+ check(loc,
+ SV("%c='1970/01/01 0:00:00'\t%Ec='1970/01/01 0:00:00'\n"),
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+ check(loc,
+ SV("%c='2009/02/13 23:31:30'\t%Ec='2009/02/13 23:31:30'\n"),
+ lfmt,
+ std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009
+#else // defined(__APPLE__)|| defined(__FreeBSD__)
+ check(loc,
+ SV("%c='1970年01月01日 00時00分00秒'\t%Ec='昭和45年01月01日 00時00分00秒'\n"),
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+
+ check(loc,
+ SV("%c='2009年02月13日 23時31分30秒'\t%Ec='平成21年02月13日 23時31分30秒'\n"),
+ lfmt,
+ std::chrono::utc_seconds(1'234'567'890s + 24s)); // 23:31:30 UTC on Friday, 13 February 2009
+#endif // defined(__APPLE__)|| defined(__FreeBSD__)
+
+ std::locale::global(std::locale::classic());
+}
+
+template
+static void test_valid_values_time_zone() {
+ using namespace std::literals::chrono_literals;
+
+ constexpr std::basic_string_view fmt = SV("{:%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}");
+ constexpr std::basic_string_view lfmt = SV("{:L%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}");
+
+ const std::locale loc(LOCALE_ja_JP_UTF_8);
+ std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+ // Non localized output using C-locale
+ check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"),
+ fmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+
+ // Use the global locale (fr_FR)
+ check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"),
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+
+ // Use supplied locale (ja_JP).
+ check(loc,
+ SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"),
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+
+ std::locale::global(std::locale::classic());
+}
+
+template
+static void test_utc_transitions() {
+ using namespace std::literals::chrono_literals;
+
+ constexpr std::basic_string_view fmt = SV("{:%F %T}");
+ check(SV("1972-06-30 23:59:59"), fmt, std::chrono::utc_seconds(78'796'799s));
+ check(SV("1972-06-30 23:59:60"), fmt, std::chrono::utc_seconds(78'796'800s));
+ check(SV("1972-07-01 00:00:00"), fmt, std::chrono::utc_seconds(78'796'801s));
+
+ check(SV("1972-12-31 23:59:59"), fmt, std::chrono::utc_seconds(94'694'400s));
+ check(SV("1972-12-31 23:59:60"), fmt, std::chrono::utc_seconds(94'694'401s));
+ check(SV("1973-01-01 00:00:00"), fmt, std::chrono::utc_seconds(94'694'402s));
+}
+
+template
+static void test_valid_values() {
+ test_valid_values_year();
+ test_valid_values_month();
+ test_valid_values_day();
+ test_valid_values_weekday();
+ test_valid_values_day_of_year();
+ test_valid_values_week();
+ test_valid_values_iso_8601_week();
+ test_valid_values_date();
+ test_valid_values_time();
+ test_valid_values_date_time();
+ test_valid_values_time_zone();
+
+ test_utc_transitions();
+}
+
+// To compute the correct UTC seconds value, the number of leap seconds needs
+// to be included in the UTC time. The number of leap seconds for times far in
+// the future is not yet known and may change.
+template +static void test() { + using namespace std::literals::chrono_literals; + + test_no_chrono_specs(); + test_valid_values(); + check_invalid_types( + {SV("a"), SV("A"), SV("b"), SV("B"), SV("c"), SV("C"), SV("d"), SV("D"), SV("e"), SV("F"), SV("g"), + SV("G"), SV("h"), SV("H"), SV("I"), SV("j"), SV("m"), SV("M"), SV("p"), SV("r"), SV("R"), SV("S"), + SV("T"), SV("u"), SV("U"), SV("V"), SV("w"), SV("W"), SV("x"), SV("X"), SV("y"), SV("Y"), SV("z"), + SV("Z"), SV("Ec"), SV("EC"), SV("Ex"), SV("EX"), SV("Ey"), SV("EY"), SV("Ez"), SV("Od"), SV("Oe"), SV("OH"), + SV("OI"), SV("Om"), SV("OM"), SV("OS"), SV("Ou"), SV("OU"), SV("OV"), SV("Ow"), SV("OW"), SV("Oy"), SV("Oz")}, + std::chrono::utc_seconds(0s)); + + check_exception("The format specifier expects a '%' or a '}'", SV("{:A"), std::chrono::utc_seconds(0s)); + check_exception("The chrono specifiers contain a '{'", SV("{:%%{"), std::chrono::utc_seconds(0s)); + check_exception("End of input while parsing a conversion specifier", SV("{:%"), std::chrono::utc_seconds(0s)); + check_exception("End of input while parsing the modifier E", SV("{:%E"), std::chrono::utc_seconds(0s)); + check_exception("End of input while parsing the modifier O", SV("{:%O"), std::chrono::utc_seconds(0s)); + + // Precision not allowed + check_exception("The format specifier expects a '%' or a '}'", SV("{:.3}"), std::chrono::utc_seconds(0s)); +} + +int main(int, char**) { + test(); + +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test(); +#endif + + return 0; +} diff --git a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp index 52cfa2c81c21a..88a485a256c80 100644 --- a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp @@ -150,9 +150,15 @@ void test_P1361() { assert_is_formattable(); assert_is_formattable, CharT>(); - //assert_is_formattable, CharT>(); +# if !defined(TEST_HAS_NO_EXPERIMENTAL_TZDB) && !defined(TEST_HAS_NO_TIME_ZONE_DATABASE) && \ + !defined(TEST_HAS_NO_FILESYSTEM) + assert_is_formattable, CharT>(); //assert_is_formattable, CharT>(); //assert_is_formattable, CharT>(); + +# endif // !defined(TEST_HAS_NO_EXPERIMENTAL_TZDB) && !defined(TEST_HAS_NO_TIME_ZONE_DATABASE) && + // !defined(TEST_HAS_NO_FILESYSTEM) + assert_is_formattable, CharT>(); assert_is_formattable, CharT>(); From 12f82fbe072382bb78ab1cbdd3fbeb8ed44cbc81 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Fri, 24 Jan 2025 10:01:02 -0800 Subject: [PATCH 032/432] [compiler-rt] Fix Windows test after profile summary change (#124318) Fix a Windows compiler-rt test that https://github.com/llvm/llvm-project/pull/105915 broke. 
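For reference, the updated CHECK lines match an `llvm-profdata show` summary of
roughly this shape; the "blocks" and "count" values below are illustrative
placeholders, since the test only checks that the labels are present:

  Total functions: 3
  Maximum function count: 1
  Maximum internal block count: 0
  Total number of blocks: 9
  Total count: 3
  Binary IDs:
    <hex binary id>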
--- compiler-rt/test/profile/Windows/binary-id.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/compiler-rt/test/profile/Windows/binary-id.c b/compiler-rt/test/profile/Windows/binary-id.c index dadc623b7af38..f115de431618b 100644 --- a/compiler-rt/test/profile/Windows/binary-id.c +++ b/compiler-rt/test/profile/Windows/binary-id.c @@ -62,6 +62,8 @@ int main() { // BINARY-ID-RAW-PROF-NEXT: Total functions: 3 // BINARY-ID-RAW-PROF-NEXT: Maximum function count: 1 // BINARY-ID-RAW-PROF-NEXT: Maximum internal block count: 0 +// BINARY-ID-RAW-PROF-NEXT: Total number of blocks: +// BINARY-ID-RAW-PROF-NEXT: Total count: // BINARY-ID-RAW-PROF-NEXT: Binary IDs: // BINARY-ID-RAW-PROF-NEXT: {{[0-9a-f]+}} @@ -69,6 +71,8 @@ int main() { // ONE-BINARY-ID-NEXT: Total functions: 3 // ONE-BINARY-ID-NEXT: Maximum function count: 3 // ONE-BINARY-ID-NEXT: Maximum internal block count: 0 +// ONE-BINARY-ID-NEXT: Total number of blocks: +// ONE-BINARY-ID-NEXT: Total count: // ONE-BINARY-ID-NEXT: Binary IDs: // ONE-BINARY-ID-NEXT: {{[0-9a-f]+}} @@ -76,6 +80,8 @@ int main() { // MULTI-BINARY-ID-NEXT: Total functions: 3 // MULTI-BINARY-ID-NEXT: Maximum function count: 1 // MULTI-BINARY-ID-NEXT: Maximum internal block count: 0 +// MULTI-BINARY-ID-NEXT: Total number of blocks: +// MULTI-BINARY-ID-NEXT: Total count: // MULTI-BINARY-ID-NEXT: Binary IDs: // MULTI-BINARY-ID-NEXT: {{[0-9a-f]+}} // MULTI-BINARY-ID-NEXT: {{[0-9a-f]+}} From 7293455cf292cfaa263ea04fc1bc2aee4ceab6a6 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Fri, 24 Jan 2025 10:02:15 -0800 Subject: [PATCH 033/432] [lldb] Add SBThread.selected_frame property (#123981) Adds a `selected_frame` property to `SBThread`. The setter accepts either a frame index (like `SetSelectedFrame`), or a frame object. Updates a few tests to make use of the new `selected_frame`. While doing so I noticed some of the usage could be cleaned up, so I did that too. --- lldb/bindings/interface/SBThreadExtensions.i | 9 +++++++++ .../commands/frame/recognizer/TestFrameRecognizer.py | 8 +++----- .../location-list-lookup/TestLocationListLookup.py | 4 ++-- .../TestStdFunctionRecognizer.py | 11 ++++------- lldb/test/API/lang/objc/print-obj/TestPrintObj.py | 9 +++------ 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/lldb/bindings/interface/SBThreadExtensions.i b/lldb/bindings/interface/SBThreadExtensions.i index 860a2d765a669..267faad9d651f 100644 --- a/lldb/bindings/interface/SBThreadExtensions.i +++ b/lldb/bindings/interface/SBThreadExtensions.i @@ -51,6 +51,14 @@ STRING_EXTENSION_OUTSIDE(SBThread) for idx in range(self.GetStopReasonDataCount()) ] + def set_selected_frame(self, frame): + if isinstance(frame, SBFrame): + if frame.thread != self: + raise ValueError("cannot select frame from different thread") + self.SetSelectedFrame(frame.idx) + else: + self.SetSelectedFrame(frame) + id = property(GetThreadID, None, doc='''A read only property that returns the thread ID as an integer.''') idx = property(GetIndexID, None, doc='''A read only property that returns the thread index ID as an integer. 
Thread index ID values start at 1 and increment as threads come and go and can be used to uniquely identify threads.''') return_value = property(GetStopReturnValue, None, doc='''A read only property that returns an lldb object that represents the return value from the last stop (lldb.SBValue) if we just stopped due to stepping out of a function.''') @@ -65,6 +73,7 @@ STRING_EXTENSION_OUTSIDE(SBThread) stop_reason_data = property(get_stop_reason_data, None, doc='''A read only property that returns the stop reason data as a list.''') is_suspended = property(IsSuspended, None, doc='''A read only property that returns a boolean value that indicates if this thread is suspended.''') is_stopped = property(IsStopped, None, doc='''A read only property that returns a boolean value that indicates if this thread is stopped but not exited.''') + selected_frame = property(GetSelectedFrame, set_selected_frame, doc='''A read/write property that gets and sets the selected frame of this SBThread.''') %} #endif } diff --git a/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py b/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py index aa2a448087431..3e9dbfe6d8555 100644 --- a/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py +++ b/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py @@ -20,7 +20,7 @@ def test_frame_recognizer_1(self): target, process, thread, _ = lldbutil.run_to_name_breakpoint( self, "foo", exe_name=exe ) - frame = thread.GetSelectedFrame() + frame = thread.selected_frame # Clear internal & plugins recognizers that get initialized at launch self.runCmd("frame recognizer clear") @@ -166,7 +166,7 @@ def test_frame_recognizer_hiding(self): self.build() target, process, thread, _ = lldbutil.run_to_name_breakpoint(self, "nested") - frame = thread.GetSelectedFrame() + frame = thread.selected_frame # Sanity check. 
self.expect( @@ -229,7 +229,6 @@ def test_frame_recognizer_multi_symbol(self): target, process, thread, _ = lldbutil.run_to_name_breakpoint( self, "foo", exe_name=exe ) - frame = thread.GetSelectedFrame() self.expect( "frame recognizer info 0", @@ -239,7 +238,6 @@ def test_frame_recognizer_multi_symbol(self): target, process, thread, _ = lldbutil.run_to_name_breakpoint( self, "bar", exe_name=exe ) - frame = thread.GetSelectedFrame() self.expect( "frame recognizer info 0", @@ -374,7 +372,7 @@ def test_frame_recognizer_not_only_first_instruction(self): opts = lldb.SBVariablesOptions() opts.SetIncludeRecognizedArguments(True) - frame = thread.GetSelectedFrame() + frame = thread.selected_frame variables = frame.GetVariables(opts) self.assertEqual(variables.GetSize(), 2) diff --git a/lldb/test/API/functionalities/location-list-lookup/TestLocationListLookup.py b/lldb/test/API/functionalities/location-list-lookup/TestLocationListLookup.py index 84033daff7730..a97f4fc5e3d79 100644 --- a/lldb/test/API/functionalities/location-list-lookup/TestLocationListLookup.py +++ b/lldb/test/API/functionalities/location-list-lookup/TestLocationListLookup.py @@ -25,7 +25,7 @@ def check_local_vars(self, process: lldb.SBProcess, check_expr: bool): # Find `bar` on the stack, then # make sure we can read out the local # variables (with both `frame var` and `expr`) - for f in process.GetSelectedThread().frames: + for f in process.selected_thread.frames: frame_name = f.GetDisplayFunctionName() if frame_name is not None and frame_name.startswith("Foo::bar"): argv = f.GetValueForVariablePath("argv").GetChildAtIndex(0) @@ -34,7 +34,7 @@ def check_local_vars(self, process: lldb.SBProcess, check_expr: bool): self.assertNotEqual(strm.GetData().find("a.out"), -1) if check_expr: - process.GetSelectedThread().SetSelectedFrame(f.idx) + process.selected_thread.selected_frame = f self.expect_expr("this", result_type="Foo *") @skipIf(oslist=["linux"], archs=["arm"]) diff --git a/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py b/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py index 978bf2066e43b..f5d0ea41e3114 100644 --- a/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py +++ b/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py @@ -69,14 +69,14 @@ def test_up_down(self): (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( self, "// break here", lldb.SBFileSpec("main.cpp") ) - frame = thread.GetSelectedFrame() + frame = thread.selected_frame # up self.assertIn("foo", frame.GetFunctionName()) start_idx = frame.GetFrameID() i = 0 while i < thread.GetNumFrames(): self.expect("up") - frame = thread.GetSelectedFrame() + frame = thread.selected_frame if frame.GetFunctionName() == "main": break end_idx = frame.GetFrameID() @@ -86,7 +86,7 @@ def test_up_down(self): start_idx = frame.GetFrameID() for i in range(1, thread.GetNumFrames()): self.expect("down") - frame = thread.GetSelectedFrame() + frame = thread.selected_frame if "foo" in frame.GetFunctionName(): break end_idx = frame.GetFrameID() @@ -99,11 +99,8 @@ def test_api(self): (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( self, "// break here", lldb.SBFileSpec("main.cpp") ) - frame = thread.GetSelectedFrame() num_hidden = 0 - for i in range(1, thread.GetNumFrames()): - thread.SetSelectedFrame(i) - frame = thread.GetSelectedFrame() + for frame in thread.frames: if frame.IsHidden(): num_hidden += 1 diff --git 
a/lldb/test/API/lang/objc/print-obj/TestPrintObj.py b/lldb/test/API/lang/objc/print-obj/TestPrintObj.py index 60fc4fbc51cee..3ad4a09b53206 100644 --- a/lldb/test/API/lang/objc/print-obj/TestPrintObj.py +++ b/lldb/test/API/lang/objc/print-obj/TestPrintObj.py @@ -69,12 +69,9 @@ def test_print_obj(self): # We want to traverse the frame to the one corresponding to blocked.m to # issue our 'po lock_me' command. - depth = other_thread.GetNumFrames() - for i in range(depth): - frame = other_thread.GetFrameAtIndex(i) - name = frame.GetFunctionName() - if name == "main": - other_thread.SetSelectedFrame(i) + for frame in other_thread.frames: + if frame.name == "main": + other_thread.selected_frame = frame if self.TraceOn(): print("selected frame:" + lldbutil.get_description(frame)) break From a9ad601f7c5486919d6fabc5dd3cb6e96f63ac61 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 24 Jan 2025 10:08:42 -0800 Subject: [PATCH 034/432] [RISCV] Use vrsub for select of add and sub of the same operands (#123400) If we have a (vselect c, a+b, a-b), we can combine this to a+(vselect c, b, -b). That by itself isn't hugely profitable, but if we reverse the select, we get a form which matches a masked vrsub.vi with zero. The result is that we can use a masked vrsub *before* the add instead of a masked add or sub. This doesn't change the critical path (since we already had the pass through on the masked second op), but does reduce register pressure since a, b, and (a+b) don't need to all be alive at once. In addition to the vselect form, we can also see the same pattern with a vector_shuffle encoding the vselect. I explored canonicalizing these to vselects instead, but that exposes several unrelated missing combines. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 89 +++++++++- .../RISCV/rvv/fixed-vectors-select-addsub.ll | 162 ++++++------------ 2 files changed, 139 insertions(+), 112 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 618fb28d3e9f9..5e5bc0819a10c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1535,7 +1535,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::INSERT_VECTOR_ELT, ISD::ABS, ISD::CTPOP, - ISD::VECTOR_SHUFFLE}); + ISD::VECTOR_SHUFFLE, ISD::VSELECT}); + if (Subtarget.hasVendorXTHeadMemPair()) setTargetDAGCombine({ISD::LOAD, ISD::STORE}); if (Subtarget.useRVVForFixedLengthVectors()) @@ -16874,6 +16875,53 @@ static SDValue useInversedSetcc(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static bool matchSelectAddSub(SDValue TrueVal, SDValue FalseVal, bool &SwapCC) { + if (!TrueVal.hasOneUse() || !FalseVal.hasOneUse()) + return false; + + SwapCC = false; + if (TrueVal.getOpcode() == ISD::SUB && FalseVal.getOpcode() == ISD::ADD) { + std::swap(TrueVal, FalseVal); + SwapCC = true; + } + + if (TrueVal.getOpcode() != ISD::ADD || FalseVal.getOpcode() != ISD::SUB) + return false; + + SDValue A = FalseVal.getOperand(0); + SDValue B = FalseVal.getOperand(1); + // Add is commutative, so check both orders + return ((TrueVal.getOperand(0) == A && TrueVal.getOperand(1) == B) || + (TrueVal.getOperand(1) == A && TrueVal.getOperand(0) == B)); +} + +/// Convert vselect CC, (add a, b), (sub a, b) to add a, (vselect CC, -b, b). +/// This allows us match a vadd.vv fed by a masked vrsub, which reduces +/// register pressure over the add followed by masked vsub sequence. 
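In IR terms, the combine the comment above describes looks roughly like this (an illustrative sketch, not taken from the patch's tests):

```llvm
define <4 x i32> @before(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) {
  ; %a, %b, and the add result are all live across the select.
  %add = add <4 x i32> %a, %b
  %sub = sub <4 x i32> %a, %b
  %res = select <4 x i1> %cc, <4 x i32> %add, <4 x i32> %sub
  ret <4 x i32> %res
}

define <4 x i32> @after(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) {
  ; Conditionally negate %b under the inverted mask; this is what matches
  ; a masked vrsub.vi with immediate 0, leaving a single unmasked vadd.
  %negb = sub <4 x i32> zeroinitializer, %b
  %ncc  = xor <4 x i1> %cc, <i1 true, i1 true, i1 true, i1 true>
  %selb = select <4 x i1> %ncc, <4 x i32> %negb, <4 x i32> %b
  %res  = add <4 x i32> %a, %selb
  ret <4 x i32> %res
}
```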
+static SDValue performVSELECTCombine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue CC = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + bool SwapCC; + if (!matchSelectAddSub(TrueVal, FalseVal, SwapCC)) + return SDValue(); + + SDValue Sub = SwapCC ? TrueVal : FalseVal; + SDValue A = Sub.getOperand(0); + SDValue B = Sub.getOperand(1); + + // Arrange the select such that we can match a masked + // vrsub.vi to perform the conditional negate + SDValue NegB = DAG.getNegative(B, DL, VT); + if (!SwapCC) + CC = DAG.getLogicalNOT(DL, CC, CC->getValueType(0)); + SDValue NewB = DAG.getNode(ISD::VSELECT, DL, VT, CC, NegB, B); + return DAG.getNode(ISD::ADD, DL, VT, A, NewB); +} + static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { if (SDValue Folded = foldSelectOfCTTZOrCTLZ(N, DAG)) @@ -17153,20 +17201,48 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT.getSimpleVT(), StridedLoad); } -/// Custom legalize or to . This runs -/// during the combine phase before type legalization, and relies on -/// DAGCombine not undoing the transform if isShuffleMaskLegal returns false -/// for the source mask. static SDValue performVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const RISCVTargetLowering &TLI) { SDLoc DL(N); EVT VT = N->getValueType(0); const unsigned ElementSize = VT.getScalarSizeInBits(); + const unsigned NumElts = VT.getVectorNumElements(); SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); ArrayRef Mask = cast(N)->getMask(); + MVT XLenVT = Subtarget.getXLenVT(); + + // Recognized a disguised select of add/sub. + bool SwapCC; + if (ShuffleVectorInst::isSelectMask(Mask, NumElts) && + matchSelectAddSub(V1, V2, SwapCC)) { + SDValue Sub = SwapCC ? V1 : V2; + SDValue A = Sub.getOperand(0); + SDValue B = Sub.getOperand(1); + + SmallVector MaskVals; + for (int MaskIndex : Mask) { + bool SelectMaskVal = (MaskIndex < (int)NumElts); + MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT)); + } + assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle"); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); + SDValue CC = DAG.getBuildVector(MaskVT, DL, MaskVals); + // Arrange the select such that we can match a masked + // vrsub.vi to perform the conditional negate + SDValue NegB = DAG.getNegative(B, DL, VT); + if (!SwapCC) + CC = DAG.getLogicalNOT(DL, CC, CC->getValueType(0)); + SDValue NewB = DAG.getNode(ISD::VSELECT, DL, VT, CC, NegB, B); + return DAG.getNode(ISD::ADD, DL, VT, A, NewB); + } + + // Custom legalize or to . This runs + // during the combine phase before type legalization, and relies on + // DAGCombine not undoing the transform if isShuffleMaskLegal returns false + // for the source mask. 
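The shuffle path above handles the same pattern in disguise: a shufflevector whose mask only ever picks lane i from one of its two inputs behaves as a vselect. A sketch of an input that now matches (mask values chosen for illustration; any select-style mask qualifies):

```llvm
define <4 x i32> @disguised_select(<4 x i32> %a, <4 x i32> %b) {
  %add = add <4 x i32> %a, %b
  %sub = sub <4 x i32> %a, %b
  ; Mask <0,5,2,7>: lanes 0 and 2 take %add, lanes 1 and 3 take %sub,
  ; so this shuffle is a vselect with condition <1,0,1,0> in disguise.
  %res = shufflevector <4 x i32> %add, <4 x i32> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %res
}
```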
if (TLI.isTypeLegal(VT) || ElementSize <= Subtarget.getELen() || !isPowerOf2_64(ElementSize) || VT.getVectorNumElements() % 2 != 0 || VT.isFloatingPoint() || TLI.isShuffleMaskLegal(Mask, VT)) @@ -17183,7 +17259,6 @@ static SDValue performVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT, Res); } - static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { @@ -17857,6 +17932,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return performTRUNCATECombine(N, DAG, Subtarget); case ISD::SELECT: return performSELECTCombine(N, DAG, Subtarget); + case ISD::VSELECT: + return performVSELECTCombine(N, DAG); case RISCVISD::CZERO_EQZ: case RISCVISD::CZERO_NEZ: { SDValue Val = N->getOperand(0); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll index ee9609992c049..318f38839851c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll @@ -9,9 +9,8 @@ define <1 x i32> @select_addsub_v1i32(<1 x i1> %cc, <1 x i32> %a, <1 x i32> %b) ; CHECK-LABEL: select_addsub_v1i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <1 x i32> %a, %b %add = add <1 x i32> %a, %b @@ -23,9 +22,8 @@ define <2 x i32> @select_addsub_v2i32(<2 x i1> %cc, <2 x i32> %a, <2 x i32> %b) ; CHECK-LABEL: select_addsub_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <2 x i32> %a, %b %add = add <2 x i32> %a, %b @@ -37,9 +35,8 @@ define <4 x i32> @select_addsub_v4i32(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) ; CHECK-LABEL: select_addsub_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b %add = add <4 x i32> %a, %b @@ -51,9 +48,9 @@ define <4 x i32> @select_addsub_v4i32_select_swapped(<4 x i1> %cc, <4 x i32> %a, ; CHECK-LABEL: select_addsub_v4i32_select_swapped: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vsub.vv v10, v8, v9 -; CHECK-NEXT: vadd.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmnot.m v0, v0 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b %add = add <4 x i32> %a, %b @@ -65,9 +62,8 @@ define <4 x i32> @select_addsub_v4i32_add_swapped(<4 x i1> %cc, <4 x i32> %a, <4 ; CHECK-LABEL: select_addsub_v4i32_add_swapped: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vadd.vv v10, v9, v8 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b %add = add <4 x i32> %b, %a @@ -79,9 +75,9 @@ define <4 x i32> @select_addsub_v4i32_both_swapped(<4 x i1> %cc, <4 x i32> %a, < ; CHECK-LABEL: select_addsub_v4i32_both_swapped: 
; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vsub.vv v10, v8, v9 -; CHECK-NEXT: vadd.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmnot.m v0, v0 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b %add = add <4 x i32> %b, %a @@ -93,12 +89,11 @@ define <4 x i32> @select_addsub_v4i32_sub_swapped(<4 x i1> %cc, <4 x i32> %a, <4 ; CHECK-LABEL: select_addsub_v4i32_sub_swapped: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vadd.vv v10, v9, v8 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret - %sub = sub <4 x i32> %a, %b - %add = add <4 x i32> %b, %a + %sub = sub <4 x i32> %b, %a + %add = add <4 x i32> %a, %b %res = select <4 x i1> %cc, <4 x i32> %sub, <4 x i32> %add ret <4 x i32> %res } @@ -107,9 +102,8 @@ define <8 x i32> @select_addsub_v8i32(<8 x i1> %cc, <8 x i32> %a, <8 x i32> %b) ; CHECK-LABEL: select_addsub_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vadd.vv v12, v8, v10 -; CHECK-NEXT: vsub.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrsub.vi v10, v10, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %sub = sub <8 x i32> %a, %b %add = add <8 x i32> %a, %b @@ -121,9 +115,8 @@ define <16 x i32> @select_addsub_v16i32(<16 x i1> %cc, <16 x i32> %a, <16 x i32> ; CHECK-LABEL: select_addsub_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vadd.vv v16, v8, v12 -; CHECK-NEXT: vsub.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vrsub.vi v12, v12, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %sub = sub <16 x i32> %a, %b %add = add <16 x i32> %a, %b @@ -136,9 +129,8 @@ define <32 x i32> @select_addsub_v32i32(<32 x i1> %cc, <32 x i32> %a, <32 x i32> ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; CHECK-NEXT: vadd.vv v24, v8, v16 -; CHECK-NEXT: vsub.vv v24, v8, v16, v0.t -; CHECK-NEXT: vmv.v.v v8, v24 +; CHECK-NEXT: vrsub.vi v16, v16, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %sub = sub <32 x i32> %a, %b %add = add <32 x i32> %a, %b @@ -153,62 +145,28 @@ define <64 x i32> @select_addsub_v64i32(<64 x i1> %cc, <64 x i32> %a, <64 x i32> ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; 
CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vv v24, v8, v16 -; CHECK-NEXT: vsub.vv v24, v8, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 4 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vadd.vv v16, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsub.vv v16, v24, v8, v0.t +; CHECK-NEXT: vadd.vv v8, v16, v8 +; CHECK-NEXT: vrsub.vi v24, v24, 0, v0.t ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v16, v16, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -224,9 +182,8 @@ define <8 x i64> @select_addsub_v8i64(<8 x i1> %cc, <8 x i64> %a, <8 x i64> %b) ; CHECK-LABEL: select_addsub_v8i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vadd.vv v16, v8, v12 -; CHECK-NEXT: vsub.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vrsub.vi v12, v12, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %sub = sub <8 x i64> %a, %b %add = add <8 x i64> %a, %b @@ -238,9 +195,8 @@ define <8 x i16> @select_addsub_v8i16(<8 x i1> %cc, <8 x i16> %a, <8 x i16> %b) ; CHECK-LABEL: select_addsub_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <8 x i16> %a, %b %add = add <8 x i16> %a, %b @@ -252,9 +208,8 @@ define <8 x i8> @select_addsub_v8i8(<8 x i1> %cc, <8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: select_addsub_v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <8 x i8> %a, %b %add = add <8 x i8> %a, %b @@ -278,9 +233,8 @@ define <8 x i2> @select_addsub_v8i2(<8 x i1> %cc, <8 x i2> %a, <8 x i2> %b) { ; CHECK-LABEL: select_addsub_v8i2: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <8 x i2> %a, %b %add = add <8 x i2> %a, %b @@ -293,9 +247,8 @@ define 
<4 x i32> @select_addsub_v4i32_constmask(<4 x i32> %a, <4 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v0, 5 -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b %add = add <4 x i32> %a, %b @@ -307,14 +260,13 @@ define <4 x i32> @select_addsub_v4i32_constmask2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_constmask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmv.v.i v0, 5 -; CHECK-NEXT: vadd.vv v10, v9, v8 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret - %sub = sub <4 x i32> %a, %b - %add = add <4 x i32> %b, %a - %res = select <4 x i1> , <4 x i32> %sub, <4 x i32> %add + %sub = sub <4 x i32> %b, %a + %add = add <4 x i32> %a, %b + %res = select <4 x i1> , <4 x i32> %add, <4 x i32> %sub ret <4 x i32> %res } @@ -324,9 +276,8 @@ define <4 x i32> @select_addsub_v4i32_as_shuffle(<4 x i32> %a, <4 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v0, 5 -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b %add = add <4 x i32> %a, %b @@ -339,13 +290,12 @@ define <4 x i32> @select_addsub_v4i32_as_shuffle2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_as_shuffle2: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmv.v.i v0, 5 -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %sub = sub <4 x i32> %b, %a %add = add <4 x i32> %a, %b - %res = shufflevector <4 x i32> %sub, <4 x i32> %add, <4 x i32> + %res = shufflevector <4 x i32> %add, <4 x i32> %sub, <4 x i32> ret <4 x i32> %res } From 544a3cb65b6b9b1455f9294d1764f47a7b8673b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 24 Jan 2025 10:09:38 -0800 Subject: [PATCH 035/432] [flang][cuda] Handle variable with initialization in device global pass (#124307) --- .../Optimizer/Transforms/CUFDeviceGlobal.cpp | 12 +++++------ .../Fir/CUDA/cuda-implicit-device-global.f90 | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp index 5ce39f99bbb12..7486dde0e281e 100644 --- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp +++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp @@ -36,13 +36,11 @@ static void processAddrOfOp(fir::AddrOfOp addrOfOp, addrOfOp.getSymbol().getRootReference().getValue())) { // TO DO: limit candidates to non-scalars. Scalars appear to have been // folded in already. 
- if (globalOp.getConstant()) { - if (recurseInGlobal) - globalOp.walk([&](fir::AddrOfOp op) { - processAddrOfOp(op, symbolTable, candidates, recurseInGlobal); - }); - candidates.insert(globalOp); - } + if (recurseInGlobal) + globalOp.walk([&](fir::AddrOfOp op) { + processAddrOfOp(op, symbolTable, candidates, recurseInGlobal); + }); + candidates.insert(globalOp); } } diff --git a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 index 9b22ed86e419c..11866d871a607 100644 --- a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 +++ b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 @@ -308,3 +308,24 @@ // Test that global used in device function are flagged with the correct // CHECK-DAG: fir.global linkonce_odr @_QM__mod1E.c.__builtin_c_devptr // CHECK-DAG: fir.global linkonce_odr @_QM__mod1E.dt.__builtin_c_devptr // CHECK-DAG: fir.global linkonce_odr @_QM__mod1E.n.__builtin_c_devptr + +// ----- + +// Variables with initialization are promoted to non constant global. +// +// attributes(global) subroutine kernel4() +// integer :: a = 4 +// end subroutine + +func.func @_QPkernel4() attributes {cuf.proc_attr = #cuf.cuda_proc} { + %0 = fir.address_of(@_QFkernel4Ea) : !fir.ref + return +} +fir.global internal @_QFkernel4Ea : i32 { + %c4_i32 = arith.constant 4 : i32 + fir.has_value %c4_i32 : i32 +} + +// CHECK-LABEL: fir.global internal @_QFkernel4Ea : i32 +// CHECK-LABEL: gpu.module @cuda_device_mod +// CHECK: fir.global internal @_QFkernel4Ea : i32 From d9b8120259a546ce7aa9f047566fef29479f59e8 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 24 Jan 2025 13:14:21 -0500 Subject: [PATCH 036/432] [lld/COFF] Fix -start-lib / -end-lib more after reviews.llvm.org/D116434 (#124294) This is a follow-up to #120452 in a way. Since lld/COFF does not yet insert all defined in an obj file before all undefineds (ELF and MachO do this, see #67445 and things linked from there), it's possible that: 1. We add an obj file a.obj 2. a.obj contains an undefined that's in b.obj, causing b.obj to be added 3. b.obj contains an undefined that's in a part of a.obj that's not yet in the symbol table, causing a recursive load of a.obj, which adds the symbols in there twice, leading to duplicate symbol errors. For normal archives, `ArchiveFile::addMember()` has a `seen` check to prevent this. For start-lib lazy objects, we can just check if the archive is still lazy at the recursive call. This bug is similar to issue #59162. (Eventually, we'll probably want to do what the MachO and ELF ports do.) Includes a test that caused duplicate symbol diagnostics before this code change. --- lld/COFF/InputFiles.cpp | 2 ++ lld/COFF/SymbolTable.cpp | 4 +++ lld/test/COFF/start-lib.ll | 69 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index fe1135db636cb..47faf70e099e1 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -151,6 +151,8 @@ void ArchiveFile::addMember(const Archive::Symbol &sym) { toCOFFString(symtab.ctx, sym)); // Return an empty buffer if we have already returned the same buffer. + // FIXME: Remove this once we resolve all defineds before all undefineds in + // ObjFile::initializeSymbols(). 
if (!seen.insert(c.getChildOffset()).second) return; diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 32ea4a5b2e1fc..307bd4a0c9411 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -56,6 +56,10 @@ static void forceLazy(Symbol *s) { } case Symbol::Kind::LazyObjectKind: { InputFile *file = cast(s)->file; + // FIXME: Remove this once we resolve all defineds before all undefineds in + // ObjFile::initializeSymbols(). + if (!file->lazy) + return; file->lazy = false; file->symtab.ctx.driver.addFile(file); break; diff --git a/lld/test/COFF/start-lib.ll b/lld/test/COFF/start-lib.ll index a46147f21ccbb..134cdc2a6e1df 100644 --- a/lld/test/COFF/start-lib.ll +++ b/lld/test/COFF/start-lib.ll @@ -173,3 +173,72 @@ target triple = "x86_64-pc-windows-msvc" define void @baz() { ret void } + + +; Check cycles between symbols in two /start-lib files. +; If the links succeed and does not emit duplicate symbol diagnostics, +; that's enough. + +; RUN: llc -filetype=obj %t.dir/main3.ll -o %t-main3.obj +; RUN: llc -filetype=obj %t.dir/cycle1.ll -o %t-cycle1.obj +; RUN: llc -filetype=obj %t.dir/cycle2.ll -o %t-cycle2.obj +; RUN: opt -thinlto-bc %t.dir/main3.ll -o %t-main3.bc +; RUN: opt -thinlto-bc %t.dir/cycle1.ll -o %t-cycle1.bc +; RUN: opt -thinlto-bc %t.dir/cycle2.ll -o %t-cycle2.bc + +; RUN: lld-link -out:%t3.exe -entry:main \ +; RUN: %t-main3.obj %t-cycle1.obj %t-cycle2.obj +; RUN: lld-link -out:%t3.exe -entry:main \ +; RUN: %t-main3.obj /start-lib %t-cycle1.obj %t-cycle2.obj /end-lib +; RUN: lld-link -out:%t3.exe -entry:main \ +; RUN: /start-lib %t-cycle1.obj %t-cycle2.obj /end-lib %t-main3.obj + +; RUN: lld-link -out:%t3.exe -entry:main \ +; RUN: %t-main3.bc %t-cycle1.bc %t-cycle2.bc +; RUN: lld-link -out:%t3.exe -entry:main \ +; RUN: %t-main3.bc /start-lib %t-cycle1.bc %t-cycle2.bc /end-lib +; RUN: lld-link -out:%t3.exe -entry:main \ +; RUN: /start-lib %t-cycle1.bc %t-cycle2.bc /end-lib %t-main3.bc + +#--- main3.ll + +target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc" + +declare void @foo1() + +define void @main() { + call void () @foo1() + ret void +} + +#--- cycle1.ll + +target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc" + +declare void @bar() + +define void @foo1() { + ; cycle1.ll pulls in cycle2.ll for bar(), and cycle2.ll then pulls in + ; cycle1.ll again for foo2(). + call void () @bar() + ret void +} + +define void @foo2() { + ret void +} + + +#--- cycle2.ll + +target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc" + +declare void @foo2() + +define void @bar() { + call void () @foo2() + ret void +} From e4009ed3d68ba8d9e78721ce5afc2b3a7edd6f36 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 24 Jan 2025 19:17:51 +0100 Subject: [PATCH 037/432] [clang][docs] Update bytecode interpreter docs (#124252) Just a light update, not adding a lot of new information. --- clang/docs/ConstantInterpreter.rst | 119 ++++------------------------- clang/www/OpenProjects.html | 11 --- 2 files changed, 16 insertions(+), 114 deletions(-) diff --git a/clang/docs/ConstantInterpreter.rst b/clang/docs/ConstantInterpreter.rst index 0c5b09c73ee30..b08cb1ce353be 100644 --- a/clang/docs/ConstantInterpreter.rst +++ b/clang/docs/ConstantInterpreter.rst @@ -18,8 +18,8 @@ by the evaluator. 
The interpreter is activated using the following flags: Bytecode Compilation ==================== -Bytecode compilation is handled in ``ByteCodeStmtGen.h`` for statements -and ``ByteCodeExprGen.h`` for expressions. The compiler has two different +Bytecode compilation is handled in ``Compiler.h`` for statements +and for expressions. The compiler has two different backends: one to generate bytecode for functions (``ByteCodeEmitter``) and one to directly evaluate expressions as they are compiled, without generating bytecode (``EvalEmitter``). All functions are compiled to @@ -44,11 +44,11 @@ Primitive Types Signed or unsigned integers of a specific bit width, implemented using the ```Integral``` type. -* ``PT_{U|S}intFP`` +* ``PT_IntAP{S}`` Signed or unsigned integers of an arbitrary, but fixed width used to implement integral types which are required by the target, but are not - supported by the host. Under the hood, they rely on APValue. The + supported by the host. Under the hood, they rely on ``APInt``. The ``Integral`` specialisation for these types is required by opcodes to share an implementation with fixed integrals. @@ -57,7 +57,7 @@ Primitive Types Representation for boolean types, essentially a 1-bit unsigned ``Integral``. -* ``PT_RealFP`` +* ``PT_Float`` Arbitrary, but fixed precision floating point numbers. Could be specialised in the future similarly to integers in order to improve @@ -65,30 +65,21 @@ Primitive Types * ``PT_Ptr`` - Pointer type, defined in ``"Pointer.h"``. A pointer can be either null, - reference interpreter-allocated memory (``BlockPointer``) or point to an - address which can be derived, but not accessed (``ExternPointer``). + Pointer type, defined in ``"Pointer.h"``. The most common type of + pointer is a "BlockPointer", which points to an ``interp::Block``. + But other pointer types exist, such as typeid pointers or + integral pointers. * ``PT_FnPtr`` Function pointer type, can also be a null function pointer. Defined - in ``"FnPointer.h"``. + in ``"FunctionPointer.h"``. -* ``PT_MemPtr`` +* ``PT_MemberPtr`` Member pointer type, can also be a null member pointer. Defined in ``"MemberPointer.h"`` -* ``PT_VoidPtr`` - - Void pointer type, can be used for round-trip casts. Represented as - the union of all pointers which can be cast to void. - Defined in ``"VoidPointer.h"``. - -* ``PT_ObjCBlockPtr`` - - Pointer type for ObjC blocks. Defined in ``"ObjCBlockPointer.h"``. - Composite types --------------- @@ -219,35 +210,21 @@ Pointers -------- Pointers, implemented in ``Pointer.h`` are represented as a tagged union. -Some of these may not yet be available in upstream ``clang``. * **BlockPointer**: used to reference memory allocated and managed by the interpreter, being the only pointer kind which allows dereferencing in the interpreter - * **ExternPointer**: points to memory which can be addressed, but not read by - the interpreter. It is equivalent to APValue, tracking a declaration and a path - of fields and indices into that allocation. - * **TargetPointer**: represents a target address derived from a base address - through pointer arithmetic, such as ``((int *)0x100)[20]``. Null pointers are - target pointers with a zero offset. - * **TypeInfoPointer**: tracks information for the opaque type returned by + * **TypeIDPointer**: tracks information for the opaque type returned by ``typeid`` - * **InvalidPointer**: is dummy pointer created by an invalid operation which - allows the interpreter to continue execution. 
Does not allow pointer - arithmetic or dereferencing. + * **IntegralPointer**: a pointer formed from an integer, + think ``(int*)123``. Besides the previously mentioned union, a number of other pointer-like types have their own type: - * **ObjCBlockPointer** tracks Objective-C blocks - * **FnPointer** tracks functions and lazily caches their compiled version + * **FunctionPointer** tracks functions. * **MemberPointer** tracks C++ object members -Void pointers, which can be built by casting any of the aforementioned -pointers, are implemented as a union of all pointer types. The ``BitCast`` -opcode is responsible for performing all legal conversions between these -types and primitive integers. - BlockPointer ~~~~~~~~~~~~ @@ -311,73 +288,9 @@ of ``a.c``, but its offset would point to ``&a.c[1]``. The array-to-pointer decay operation adjusts a pointer to an array (where the offset is equal to the base) to a pointer to the first element. -ExternPointer -~~~~~~~~~~~~~ - -Extern pointers can be derived, pointing into symbols which are not -readable from constexpr. An external pointer consists of a base -declaration, along with a path designating a subobject, similar to -the ``LValuePath`` of an APValue. Extern pointers can be converted -to block pointers if the underlying variable is defined after the -pointer is created, as is the case in the following example: - -.. code-block:: c - - extern const int a; - constexpr const int *p = &a; - const int a = 5; - static_assert(*p == 5, "x"); - -TargetPointer -~~~~~~~~~~~~~ - -While null pointer arithmetic or integer-to-pointer conversion is -banned in constexpr, some expressions on target offsets must be folded, -replicating the behaviour of the ``offsetof`` builtin. Target pointers -are characterised by 3 offsets: a field offset, an array offset and a -base offset, along with a descriptor specifying the type the pointer is -supposed to refer to. Array indexing adjusts the array offset, while the -field offset is adjusted when a pointer to a member is created. Casting -an integer to a pointer sets the value of the base offset. As a special -case, null pointers are target pointers with all offsets set to 0. - TypeInfoPointer ~~~~~~~~~~~~~~~ ``TypeInfoPointer`` tracks two types: the type assigned to ``std::type_info`` and the type which was passed to ``typeinfo``. - -InvalidPointer -~~~~~~~~~~~~~~ - -Such pointers are built by operations which cannot generate valid -pointers, allowing the interpreter to continue execution after emitting -a warning. Inspecting such a pointer stops execution. - -TODO -==== - -Missing Language Features -------------------------- - -* Changing the active field of unions -* ``volatile`` -* ``__builtin_constant_p`` -* ``dynamic_cast`` -* ``new`` and ``delete`` -* Fixed Point numbers and arithmetic on Complex numbers -* Several builtin methods, including string operations and - ``__builtin_bit_cast`` -* Continue-after-failure: a form of exception handling at the bytecode - level should be implemented to allow execution to resume. As an example, - argument evaluation should resume after the computation of an argument fails. -* Pointer-to-Integer conversions -* Lazy descriptors: the interpreter creates a ``Record`` and ``Descriptor`` - when it encounters a type: ones which are not yet defined should be lazily - created when required - -Known Bugs ----------- - -* If execution fails, memory storing APInts and APFloats is leaked when the - stack is cleared +It is part of the taged union in ``Pointer``. 
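As a concrete illustration of the BlockPointer model described above (a sketch, not part of the documentation change; the flag `-fexperimental-new-constant-interpreter` enables the bytecode interpreter):

```cpp
// 'p' is a BlockPointer: its base is the interpreter-managed block
// backing 'arr', and its offset designates element 1.
constexpr int arr[3] = {1, 2, 3};
constexpr const int *p = arr + 1;  // array-to-pointer decay, then offset
static_assert(*p == 2, "dereferences through the pointee block");
```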
diff --git a/clang/www/OpenProjects.html b/clang/www/OpenProjects.html index d48b3bebe7611..a9efdb8d762d7 100755 --- a/clang/www/OpenProjects.html +++ b/clang/www/OpenProjects.html @@ -90,17 +90,6 @@

Open Clang Projects
performance as well as to find ways to proactively alert us when we've introduced a change that has significant negative impact on build times.
-  • Complete support for the experimental constant expression interpreter: Clang's production constant expression interpreter computes a constant expression result by walking over AST nodes, performing calculations as it goes. This does not have good performance properties, and so we've begun work on an experimental constant expression interpreter that works by converting the AST into bytecode that is interpreted. This effort has a long tail of work left to complete because it requires implementing byte code for every kind of expression and type that can be used in a constant expression for C++ and C.
-
  • Improve clang-doc: Clang's library-based design allows it to be used by a variety of tools that reason about source code. clang-doc is one From 825e712959d48f14b47e579871bcf9b5e25fff7a Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Fri, 24 Jan 2025 10:24:09 -0800 Subject: [PATCH 038/432] [HLSL] cbuffer: create host layout structs (#122820) Creates layout struct for `cbuffer` in Sema which will contains only declarations contributing to the constant buffer layout. Anything else will be filtered out, such as static variables decls, struct and function definitions, resources, or empty struct and zero-sized arrays. If the constant buffer includes a struct that contains any of the above undesirable declarations, a new version of this struct should be created with these declarations filtered out as well. The definition of buffer layout struct will be added to the HLSLBufferDecl AST node as the last node. Any layout structs for embedded structures will be added there as well. Fixes #122553 --- clang/lib/Sema/SemaHLSL.cpp | 239 ++++++++++++++++++ ... => ast-dump-comment-cbuffer-tbuffer.hlsl} | 39 +-- clang/test/AST/HLSL/cbuffer.hlsl | 209 +++++++++++++++ .../test/AST/HLSL/cbuffer_and_namespaces.hlsl | 98 +++++++ clang/test/AST/HLSL/cbuffer_tbuffer.hlsl | 26 -- clang/test/AST/HLSL/pch_hlsl_buffer.hlsl | 39 +-- 6 files changed, 592 insertions(+), 58 deletions(-) rename clang/test/AST/HLSL/{ast-dump-comment-cbuffe-tbufferr.hlsl => ast-dump-comment-cbuffer-tbuffer.hlsl} (50%) create mode 100644 clang/test/AST/HLSL/cbuffer.hlsl create mode 100644 clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl delete mode 100644 clang/test/AST/HLSL/cbuffer_tbuffer.hlsl diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 5001883003ee2..f26469e6a2f1d 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -15,12 +15,14 @@ #include "clang/AST/Decl.h" #include "clang/AST/DeclBase.h" #include "clang/AST/DeclCXX.h" +#include "clang/AST/DeclarationName.h" #include "clang/AST/DynamicRecursiveASTVisitor.h" #include "clang/AST/Expr.h" #include "clang/AST/Type.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/DiagnosticSema.h" +#include "clang/Basic/IdentifierTable.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/TargetInfo.h" @@ -32,16 +34,21 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/Casting.h" #include "llvm/Support/DXILABI.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" +#include #include #include using namespace clang; using RegisterType = HLSLResourceBindingAttr::RegisterType; +static CXXRecordDecl *createHostLayoutStruct(Sema &S, + CXXRecordDecl *StructDecl); + static RegisterType getRegisterType(ResourceClass RC) { switch (RC) { case ResourceClass::SRV: @@ -253,12 +260,244 @@ static void validatePackoffset(Sema &S, HLSLBufferDecl *BufDecl) { } } +// Returns true if the array has a zero size = if any of the dimensions is 0 +static bool isZeroSizedArray(const ConstantArrayType *CAT) { + while (CAT && !CAT->isZeroSize()) + CAT = dyn_cast( + CAT->getElementType()->getUnqualifiedDesugaredType()); + return CAT != nullptr; +} + +// Returns true if the record type is an HLSL resource class +static bool isResourceRecordType(const Type *Ty) { + return HLSLAttributedResourceType::findHandleTypeOnResource(Ty) != nullptr; +} + +// Returns true if the 
type is a leaf element type that is not valid to be +// included in HLSL Buffer, such as a resource class, empty struct, zero-sized +// array, or a builtin intangible type. Returns false it is a valid leaf element +// type or if it is a record type that needs to be inspected further. +static bool isInvalidConstantBufferLeafElementType(const Type *Ty) { + if (Ty->isRecordType()) { + if (isResourceRecordType(Ty) || Ty->getAsCXXRecordDecl()->isEmpty()) + return true; + return false; + } + if (Ty->isConstantArrayType() && + isZeroSizedArray(cast(Ty))) + return true; + if (Ty->isHLSLBuiltinIntangibleType()) + return true; + return false; +} + +// Returns true if the struct contains at least one element that prevents it +// from being included inside HLSL Buffer as is, such as an intangible type, +// empty struct, or zero-sized array. If it does, a new implicit layout struct +// needs to be created for HLSL Buffer use that will exclude these unwanted +// declarations (see createHostLayoutStruct function). +static bool requiresImplicitBufferLayoutStructure(const CXXRecordDecl *RD) { + if (RD->getTypeForDecl()->isHLSLIntangibleType() || RD->isEmpty()) + return true; + // check fields + for (const FieldDecl *Field : RD->fields()) { + QualType Ty = Field->getType(); + if (isInvalidConstantBufferLeafElementType(Ty.getTypePtr())) + return true; + if (Ty->isRecordType() && + requiresImplicitBufferLayoutStructure(Ty->getAsCXXRecordDecl())) + return true; + } + // check bases + for (const CXXBaseSpecifier &Base : RD->bases()) + if (requiresImplicitBufferLayoutStructure( + Base.getType()->getAsCXXRecordDecl())) + return true; + return false; +} + +static CXXRecordDecl *findRecordDeclInContext(IdentifierInfo *II, + DeclContext *DC) { + CXXRecordDecl *RD = nullptr; + for (NamedDecl *Decl : + DC->getNonTransparentContext()->lookup(DeclarationName(II))) { + if (CXXRecordDecl *FoundRD = dyn_cast(Decl)) { + assert(RD == nullptr && + "there should be at most 1 record by a given name in a scope"); + RD = FoundRD; + } + } + return RD; +} + +// Creates a name for buffer layout struct using the provide name base. +// If the name must be unique (not previously defined), a suffix is added +// until a unique name is found. +static IdentifierInfo *getHostLayoutStructName(Sema &S, NamedDecl *BaseDecl, + bool MustBeUnique) { + ASTContext &AST = S.getASTContext(); + + IdentifierInfo *NameBaseII = BaseDecl->getIdentifier(); + llvm::SmallString<64> Name("__layout_"); + if (NameBaseII) { + Name.append(NameBaseII->getName()); + } else { + // anonymous struct + Name.append("anon"); + MustBeUnique = true; + } + + size_t NameLength = Name.size(); + IdentifierInfo *II = &AST.Idents.get(Name, tok::TokenKind::identifier); + if (!MustBeUnique) + return II; + + unsigned suffix = 0; + while (true) { + if (suffix != 0) { + Name.append("_"); + Name.append(llvm::Twine(suffix).str()); + II = &AST.Idents.get(Name, tok::TokenKind::identifier); + } + if (!findRecordDeclInContext(II, BaseDecl->getDeclContext())) + return II; + // declaration with that name already exists - increment suffix and try + // again until unique name is found + suffix++; + Name.truncate(NameLength); + }; +} + +// Creates a field declaration of given name and type for HLSL buffer layout +// struct. Returns nullptr if the type cannot be use in HLSL Buffer layout. 
+static FieldDecl *createFieldForHostLayoutStruct(Sema &S, const Type *Ty, + IdentifierInfo *II, + CXXRecordDecl *LayoutStruct) { + if (isInvalidConstantBufferLeafElementType(Ty)) + return nullptr; + + if (Ty->isRecordType()) { + CXXRecordDecl *RD = Ty->getAsCXXRecordDecl(); + if (requiresImplicitBufferLayoutStructure(RD)) { + RD = createHostLayoutStruct(S, RD); + if (!RD) + return nullptr; + Ty = RD->getTypeForDecl(); + } + } + + QualType QT = QualType(Ty, 0); + ASTContext &AST = S.getASTContext(); + TypeSourceInfo *TSI = AST.getTrivialTypeSourceInfo(QT, SourceLocation()); + auto *Field = FieldDecl::Create(AST, LayoutStruct, SourceLocation(), + SourceLocation(), II, QT, TSI, nullptr, false, + InClassInitStyle::ICIS_NoInit); + Field->setAccess(AccessSpecifier::AS_private); + return Field; +} + +// Creates host layout struct for a struct included in HLSL Buffer. +// The layout struct will include only fields that are allowed in HLSL buffer. +// These fields will be filtered out: +// - resource classes +// - empty structs +// - zero-sized arrays +// Returns nullptr if the resulting layout struct would be empty. +static CXXRecordDecl *createHostLayoutStruct(Sema &S, + CXXRecordDecl *StructDecl) { + assert(requiresImplicitBufferLayoutStructure(StructDecl) && + "struct is already HLSL buffer compatible"); + + ASTContext &AST = S.getASTContext(); + DeclContext *DC = StructDecl->getDeclContext(); + IdentifierInfo *II = getHostLayoutStructName(S, StructDecl, false); + + // reuse existing if the layout struct if it already exists + if (CXXRecordDecl *RD = findRecordDeclInContext(II, DC)) + return RD; + + CXXRecordDecl *LS = CXXRecordDecl::Create( + AST, TagDecl::TagKind::Class, DC, SourceLocation(), SourceLocation(), II); + LS->setImplicit(true); + LS->startDefinition(); + + // copy base struct, create HLSL Buffer compatible version if needed + if (unsigned NumBases = StructDecl->getNumBases()) { + assert(NumBases == 1 && "HLSL supports only one base type"); + CXXBaseSpecifier Base = *StructDecl->bases_begin(); + CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl(); + if (requiresImplicitBufferLayoutStructure(BaseDecl)) { + BaseDecl = createHostLayoutStruct(S, BaseDecl); + if (BaseDecl) { + TypeSourceInfo *TSI = AST.getTrivialTypeSourceInfo( + QualType(BaseDecl->getTypeForDecl(), 0)); + Base = CXXBaseSpecifier(SourceRange(), false, StructDecl->isClass(), + AS_none, TSI, SourceLocation()); + } + } + if (BaseDecl) { + const CXXBaseSpecifier *BasesArray[1] = {&Base}; + LS->setBases(BasesArray, 1); + } + } + + // filter struct fields + for (const FieldDecl *FD : StructDecl->fields()) { + const Type *Ty = FD->getType()->getUnqualifiedDesugaredType(); + if (FieldDecl *NewFD = + createFieldForHostLayoutStruct(S, Ty, FD->getIdentifier(), LS)) + LS->addDecl(NewFD); + } + LS->completeDefinition(); + + if (LS->field_empty() && LS->getNumBases() == 0) + return nullptr; + + DC->addDecl(LS); + return LS; +} + +// Creates host layout struct for HLSL Buffer. The struct will include only +// fields of types that are allowed in HLSL buffer and it will filter out: +// - static variable declarations +// - resource classes +// - empty structs +// - zero-sized arrays +// - non-variable declarations +// The layour struct will be added to the HLSLBufferDecl declarations. 
+void createHostLayoutStructForBuffer(Sema &S, HLSLBufferDecl *BufDecl) { + ASTContext &AST = S.getASTContext(); + IdentifierInfo *II = getHostLayoutStructName(S, BufDecl, true); + + CXXRecordDecl *LS = + CXXRecordDecl::Create(AST, TagDecl::TagKind::Class, BufDecl, + SourceLocation(), SourceLocation(), II); + LS->setImplicit(true); + LS->startDefinition(); + + for (const Decl *D : BufDecl->decls()) { + const VarDecl *VD = dyn_cast(D); + if (!VD || VD->getStorageClass() == SC_Static) + continue; + const Type *Ty = VD->getType()->getUnqualifiedDesugaredType(); + if (FieldDecl *FD = + createFieldForHostLayoutStruct(S, Ty, VD->getIdentifier(), LS)) + LS->addDecl(FD); + } + LS->completeDefinition(); + BufDecl->addDecl(LS); +} + +// Handle end of cbuffer/tbuffer declaration void SemaHLSL::ActOnFinishBuffer(Decl *Dcl, SourceLocation RBrace) { auto *BufDecl = cast(Dcl); BufDecl->setRBraceLoc(RBrace); validatePackoffset(SemaRef, BufDecl); + // create buffer layout struct + createHostLayoutStructForBuffer(SemaRef, BufDecl); + SemaRef.PopDeclContext(); } diff --git a/clang/test/AST/HLSL/ast-dump-comment-cbuffe-tbufferr.hlsl b/clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl similarity index 50% rename from clang/test/AST/HLSL/ast-dump-comment-cbuffe-tbufferr.hlsl rename to clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl index e6a2ea7c6d2dc..0bff3ae144037 100644 --- a/clang/test/AST/HLSL/ast-dump-comment-cbuffe-tbufferr.hlsl +++ b/clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl @@ -37,19 +37,26 @@ tbuffer B { int d; } -// AST:HLSLBufferDecl {{.*}}:11:1, line:20:1> line:11:9 cbuffer A -// AST-NEXT:-HLSLResourceClassAttr {{.*}} <> Implicit CBuffer -// AST-NEXT:-HLSLResourceAttr {{.*}} <> Implicit CBuffer -// AST-NEXT:FullComment {{.*}} -// AST-NEXT:`-ParagraphComment {{.*}} -// AST-NEXT:`-TextComment {{.*}} Text=" CBuffer decl." -// AST-NEXT:-VarDecl {{.*}} col:11 a 'float' -// AST-NEXT:`-VarDecl {{.*}} col:9 b 'int' -// AST-NEXT:HLSLBufferDecl {{.*}} line:29:9 tbuffer B -// AST-NEXT:-HLSLResourceClassAttr {{.*}} <> Implicit SRV -// AST-NEXT:-HLSLResourceAttr {{.*}} <> Implicit TBuffer -// AST-NEXT:-FullComment {{.*}} -// AST-NEXT: `-ParagraphComment {{.*}} -// AST-NEXT: `-TextComment {{.*}} Text=" TBuffer decl." -// AST-NEXT:-VarDecl {{.*}} col:11 c 'float' -// AST-NEXT:`-VarDecl {{.*}} col:9 d 'int' +// AST: HLSLBufferDecl {{.*}} line:11:9 cbuffer A +// AST-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// AST-NEXT: HLSLResourceAttr {{.*}} Implicit CBuffer +// AST-NEXT: FullComment +// AST-NEXT: ParagraphComment +// AST-NEXT: TextComment {{.*}} Text=" CBuffer decl." +// AST-NEXT: VarDecl {{.*}} a 'float' +// AST-NEXT: VarDecl {{.*}} b 'int' +// AST-NEXT: CXXRecordDecl {{.*}} implicit class __layout_A definition +// AST: FieldDecl {{.*}} a 'float' +// AST-NEXT: FieldDecl {{.*}} b 'int' + +// AST-NEXT: HLSLBufferDecl {{.*}} line:29:9 tbuffer B +// AST-NEXT: HLSLResourceClassAttr {{.*}} Implicit SRV +// AST-NEXT: HLSLResourceAttr {{.*}} Implicit TBuffer +// AST-NEXT: FullComment +// AST-NEXT: ParagraphComment +// AST-NEXT: TextComment {{.*}} Text=" TBuffer decl." 
+// AST-NEXT: VarDecl {{.*}} c 'float' +// AST-NEXT: VarDecl {{.*}} d 'int' +// AST-NEXT: CXXRecordDecl {{.*}} implicit class __layout_B definition +// AST: FieldDecl {{.*}} c 'float' +// AST-NEXT: FieldDecl {{.*}} d 'int' diff --git a/clang/test/AST/HLSL/cbuffer.hlsl b/clang/test/AST/HLSL/cbuffer.hlsl new file mode 100644 index 0000000000000..721abb290f163 --- /dev/null +++ b/clang/test/AST/HLSL/cbuffer.hlsl @@ -0,0 +1,209 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -ast-dump -o - %s | FileCheck %s + +struct EmptyStruct { +}; + +struct A { + float a; +}; + +struct B { + RWBuffer buf; + EmptyStruct es; + float ea[0]; + float a; +}; + +struct C { + EmptyStruct es; +}; + +typedef B BTypedef; +typedef C CTypedef; + +struct D : B { + float b; +}; + +struct E : EmptyStruct { + float c; +}; + +struct F : A { + int ae[0]; +}; + +typedef float EmptyArrayTypedef[10][0]; + +struct OneFloat { + float a; +}; + +struct TwoFloats { + float a; + float b; +}; + +// CHECK: HLSLBufferDecl {{.*}} line:50:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: VarDecl {{.*}} col:9 used a1 'float' + float a1; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB definition + // CHECK: FieldDecl {{.*}} a1 'float' +} +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_CB), ""); + +// Check that buffer layout struct does not include resources or empty types +// CHECK: HLSLBufferDecl {{.*}} line:62:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: VarDecl {{.*}} col:9 used a2 'float' + float a2; + // CHECK: VarDecl {{.*}} col:19 b2 'RWBuffer':'hlsl::RWBuffer' + RWBuffer b2; + // CHECK: VarDecl {{.*}} col:15 c2 'EmptyStruct' + EmptyStruct c2; + // CHECK: VarDecl {{.*}} col:9 d2 'float[0]' + float d2[0]; + // CHECK: VarDecl {{.*}} col:9 e2 'float' + float e2; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_1 definition + // CHECK: FieldDecl {{.*}} a2 'float' + // CHECK-NEXT: FieldDecl {{.*}} e2 'float' +} +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layout_CB_1), ""); + +// Check that layout struct is created for B and the empty struct C is removed +// CHECK: HLSLBufferDecl {{.*}} line:83:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: VarDecl {{.*}} col:5 used s1 'A' + A s1; + // CHECK: VarDecl {{.*}} col:5 s2 'B' + B s2; + // CHECK: VarDecl {{.*}} col:12 s3 'CTypedef':'C + CTypedef s3; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_2 definition + // CHECK: FieldDecl {{.*}} s1 'A' + // CHECK: FieldDecl {{.*}} s2 '__layout_B' +} +// CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_B definition +// CHECK: FieldDecl {{.*}} a 'float' + +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_B), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layout_CB_2), ""); + +// check that layout struct is created for D because of its base struct +// CHECK: HLSLBufferDecl {{.*}} line:104:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: VarDecl {{.*}} s4 'D' + D s4; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_3 
definition + // CHECK: FieldDecl {{.*}} s4 '__layout_D' +} + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_D definition + // CHECK: public '__layout_B' + // CHECK: FieldDecl {{.*}} b 'float' +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layout_D), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layout_CB_3), ""); + +// check that layout struct is created for E because because its base struct +// is empty and should be eliminated, and BTypedef should reuse the previously +// defined '__layout_B' +// CHECK: HLSLBufferDecl {{.*}} line:122:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: VarDecl {{.*}} s5 'E' + E s5; + // CHECK: VarDecl {{.*}} s6 'BTypedef':'B' + BTypedef s6; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_4 definition + // CHECK: FieldDecl {{.*}} s5 '__layout_E' + // CHECK: FieldDecl {{.*}} s6 '__layout_B' +} + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_E definition + // CHECK: FieldDecl {{.*}} c 'float' + // CHECK-NOT: CXXRecordDecl {{.*}} class __layout_B definition +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_E), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layout_CB_4), ""); + +// check that this produces empty layout struct +// CHECK: HLSLBufferDecl {{.*}} line:141:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: FunctionDecl {{.*}} f 'void ()' + void f() {} + // CHECK: VarDecl {{.*}} SV 'float' static + static float SV; + // CHECK: VarDecl {{.*}} s7 'EmptyStruct' callinit + EmptyStruct s7; + // CHECK: VarDecl {{.*}} Buf 'RWBuffer':'hlsl::RWBuffer' callinit + RWBuffer Buf; + // CHECK: VarDecl {{.*}} ea 'EmptyArrayTypedef':'float[10][0]' + EmptyArrayTypedef ea; + // CHECK: CXXRecordDecl {{.*}} implicit class __layout_CB_5 definition + // CHECK-NOT: FieldDecl +} + +// check host layout struct with compatible base struct +// CHECK: HLSLBufferDecl {{.*}} line:160:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: VarDecl {{.*}} s8 'F' + F s8; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_6 definition + // CHECK: FieldDecl {{.*}} s8 '__layout_F' +} + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_F definition + // CHECK: public 'A' +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_F), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_CB_6), ""); + +// anonymous structs +// CHECK: HLSLBufferDecl {{.*}} line:175:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: CXXRecordDecl {{.*}} struct definition + struct { + // CHECK: FieldDecl {{.*}} e 'float' + float e; + // CHECK: FieldDecl {{.*}} c 'int[0][1]' + int c[0][1]; + // CHECK: FieldDecl {{.*}} f 'RWBuffer':'hlsl::RWBuffer' + RWBuffer f; + } s9; + // CHECK: VarDecl {{.*}} s9 'struct (unnamed struct at {{.*}}cbuffer.hlsl:177:3 + // CHECK: CXXRecordDecl {{.*}} struct definition + struct { + // CHECK: FieldDecl {{.*}} g 'float' + float g; + // CHECK: FieldDecl {{.*}} f 'RWBuffer':'hlsl::RWBuffer' + RWBuffer f; + } s10; + // CHECK: 
VarDecl {{.*}} s10 'struct (unnamed struct at {{.*}}cbuffer.hlsl:187:3 + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_anon definition + // CHECK: FieldDecl {{.*}} e 'float' + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_anon_1 definition + // CHECK: FieldDecl {{.*}} g 'float' + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_7 definition + // CHECK: FieldDecl {{.*}} s9 '__layout_anon' + // CHECK: FieldDecl {{.*}} s10 '__layout_anon_1' +} +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_anon), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_anon_1), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layout_CB_7), ""); + +// Add uses for the constant buffer declarations so they are not optimized away +export float foo() { + return a1 + a2 + s1.a + s4.b + s5.c + s8.a + s9.e; +} diff --git a/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl b/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl new file mode 100644 index 0000000000000..4b1bbea736f85 --- /dev/null +++ b/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl @@ -0,0 +1,98 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -ast-dump -o - %s | FileCheck %s + +// CHECK: CXXRecordDecl {{.*}} struct EmptyStruct definition +struct EmptyStruct { +}; + +// CHECK: NamespaceDecl {{.*}} NS1 +namespace NS1 { + // CHECK: CXXRecordDecl {{.*}} struct Foo definition + struct Foo { + float a; + EmptyStruct es; + }; + + // CHECK: CXXRecordDecl {{.*}} struct Bar definition + struct Bar { + // CHECK: CXXRecordDecl {{.*}} struct Foo definition + struct Foo { + int b; + EmptyStruct es; + }; + // CHECK: CXXRecordDecl {{.*}} implicit class __layout_Foo definition + // CHECK: FieldDecl {{.*}} b 'int' + }; + // CHECK: CXXRecordDecl {{.*}} implicit class __layout_Foo definition + // CHECK: FieldDecl {{.*}} a 'float' +} + +struct Foo { + double c; + EmptyStruct es; +}; + +// CHECK: HLSLBufferDecl {{.*}} line:37:9 cbuffer CB1 +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB1 { + // CHECK: VarDecl {{.*}} foo1 'Foo' + Foo foo1; + // CHECK: VarDecl {{.*}} foo2 'NS1::Foo' + NS1::Foo foo2; + // CHECK: VarDecl {{.*}} foo3 'NS1::Bar::Foo' + NS1::Bar::Foo foo3; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB1 definition + // CHECK: FieldDecl {{.*}} foo1 '__layout_Foo' + // CHECK: FieldDecl {{.*}} foo2 'NS1::__layout_Foo' + // CHECK: FieldDecl {{.*}} foo3 'NS1::Bar::__layout_Foo' +} +// CHECK: CXXRecordDecl {{.*}} implicit class __layout_Foo definition +// CHECK: FieldDecl {{.*}} c 'double' + +struct CB1ExpectedShape { + double a1; + float a2; + int a; +}; +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(CB1ExpectedShape, __layout_CB1), ""); + +namespace NS2 { + struct Foo { + float d[4]; + EmptyStruct es; + }; + // CHECK: HLSLBufferDecl {{.*}} line:67:11 cbuffer CB2 + // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer + // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer + cbuffer CB2 { + // CHECK: VarDecl {{.*}} foo0 '::Foo':'Foo' + ::Foo foo0; + // CHECK: VarDecl {{.*}} foo1 'Foo':'NS2::Foo' + Foo foo1; + // CHECK: VarDecl {{.*}} foo2 'NS1::Foo' + NS1::Foo foo2; + // CHECK: VarDecl {{.*}} foo3 'NS1::Bar::Foo' + NS1::Bar::Foo foo3; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB2 definition + // CHECK: FieldDecl {{.*}} foo0 '__layout_Foo' + // CHECK: FieldDecl {{.*}} foo1 
'NS2::__layout_Foo' + // CHECK: FieldDecl {{.*}} foo2 'NS1::__layout_Foo' + // CHECK: FieldDecl {{.*}} foo3 'NS1::Bar::__layout_Foo' + } + // CHECK: CXXRecordDecl {{.*}} implicit class __layout_Foo definition + // CHECK: FieldDecl {{.*}} d 'float[4]' +} + +struct CB2ExpectedShape { + double a1; + float d[4]; + float a2; + int a; +}; +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(CB2ExpectedShape, NS2::__layout_CB2), ""); + +// Add uses for the constant buffer declarations so they are not optimized away +// CHECK: ExportDecl +export float f() { + return foo2.a + NS2::foo2.a; +} diff --git a/clang/test/AST/HLSL/cbuffer_tbuffer.hlsl b/clang/test/AST/HLSL/cbuffer_tbuffer.hlsl deleted file mode 100644 index 5e558354cd3a0..0000000000000 --- a/clang/test/AST/HLSL/cbuffer_tbuffer.hlsl +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK:HLSLBufferDecl 0x[[CB:[0-9a-f]+]] {{.*}} line:7:9 cbuffer CB -// CHECK:HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit CBuffer -// CHECK-NEXT:HLSLResourceAttr {{.*}} <> Implicit CBuffer -// CHECK-NEXT:VarDecl 0x[[A:[0-9a-f]+]] {{.*}} col:9 used a 'float' -cbuffer CB { - float a; -} - -// CHECK:HLSLBufferDecl 0x[[TB:[0-9a-f]+]] {{.*}} line:15:9 tbuffer TB -// CHECK:HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit SRV -// CHECK-NEXT:HLSLResourceAttr {{.*}} <> Implicit TBuffer -// CHECK-NEXT:VarDecl 0x[[B:[0-9a-f]+]] {{.*}} col:9 used b 'float' -tbuffer TB { - float b; -} - -float foo() { -// CHECK: BinaryOperator 0x{{[0-9a-f]+}} 'float' '+' -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-f]+}} 'float' -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} 'float' lvalue Var 0x[[A]] 'a' 'float' -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-f]+}} 'float' -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} 'float' lvalue Var 0x[[B]] 'b' 'float' - return a + b; -} diff --git a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl index 281d8be8addf0..3eabbb1f8ae22 100644 --- a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl +++ b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl @@ -15,20 +15,27 @@ tbuffer B { float foo() { return a + b; } + // Make sure cbuffer/tbuffer works for PCH. 
-// CHECK:HLSLBufferDecl 0x{{[0-9a-f]+}} <{{.*}}:7:1, line:9:1> line:7:9 imported cbuffer A -// CHECK-NEXT:HLSLResourceClassAttr {{.*}} <> Implicit CBuffer -// CHECK-NEXT:HLSLResourceAttr {{.*}} <> Implicit CBuffer -// CHECK-NEXT:`-VarDecl 0x[[A:[0-9a-f]+]] col:9 imported used a 'float' -// CHECK-NEXT:HLSLBufferDecl 0x{{[0-9a-f]+}} line:11:9 imported tbuffer B -// CHECK-NEXT:HLSLResourceClassAttr {{.*}} <> Implicit SRV -// CHECK-NEXT:HLSLResourceAttr {{.*}} <> Implicit TBuffer -// CHECK-NEXT:`-VarDecl 0x[[B:[0-9a-f]+]] col:9 imported used b 'float' -// CHECK-NEXT:FunctionDecl 0x{{[0-9a-f]+}} line:15:7 imported foo 'float ()' -// CHECK-NEXT:CompoundStmt 0x{{[0-9a-f]+}} -// CHECK-NEXT:ReturnStmt 0x{{[0-9a-f]+}} -// CHECK-NEXT:BinaryOperator 0x{{[0-9a-f]+}} 'float' '+' -// CHECK-NEXT:ImplicitCastExpr 0x{{[0-9a-f]+}} 'float' -// CHECK-NEXT:`-DeclRefExpr 0x{{[0-9a-f]+}} 'float' lvalue Var 0x[[A]] 'a' 'float' -// CHECK-NEXT:`-ImplicitCastExpr 0x{{[0-9a-f]+}} 'float' -// CHECK-NEXT:`-DeclRefExpr 0x{{[0-9a-f]+}} 'float' lvalue Var 0x[[B]] 'b' 'float' +// CHECK: HLSLBufferDecl {{.*}} line:7:9 imported cbuffer A +// CHECK-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK-NEXT: HLSLResourceAttr {{.*}} Implicit CBuffer +// CHECK-NEXT: VarDecl 0x[[A:[0-9a-f]+]] {{.*}} imported used a 'float' +// CHECK-NEXT: CXXRecordDecl {{.*}} imported implicit class __layout_A definition +// CHECK: FieldDecl {{.*}} imported a 'float' + +// CHECK: HLSLBufferDecl {{.*}} line:11:9 imported tbuffer B +// CHECK-NEXT: HLSLResourceClassAttr {{.*}} Implicit SRV +// CHECK-NEXT: HLSLResourceAttr {{.*}} Implicit TBuffer +// CHECK-NEXT: VarDecl 0x[[B:[0-9a-f]+]] {{.*}} imported used b 'float' +// CHECK-NEXT: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} imported implicit class __layout_B definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} {{.*}} imported b 'float' + +// CHECK-NEXT: FunctionDecl {{.*}} line:15:7 imported foo 'float ()' +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-NEXT: ReturnStmt {{.*}} +// CHECK-NEXT: BinaryOperator {{.*}} 'float' '+' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue Var 0x[[A]] 'a' 'float' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue Var 0x[[B]] 'b' 'float' From 3861b9db882d5637725ceeccb801c2bb837e8fc5 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 24 Jan 2025 18:27:23 +0000 Subject: [PATCH 039/432] [gn build] Port 0cd794d4860e --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 639095b698c6f..90303821eb09f 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -347,6 +347,7 @@ if (current_toolchain == default_toolchain) { "__chrono/time_zone_link.h", "__chrono/tzdb.h", "__chrono/tzdb_list.h", + "__chrono/utc_clock.h", "__chrono/weekday.h", "__chrono/year.h", "__chrono/year_month.h", From ab976a17121374ae3407374b2aa6306e95863eb3 Mon Sep 17 00:00:00 2001 From: Stephen Long <63318318+steplong@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:02:06 -0500 Subject: [PATCH 040/432] PreISelIntrinsicLowering: Lower llvm.exp/llvm.exp2 to a loop if scalable vec arg (#117568) --- llvm/include/llvm/CodeGen/TargetLowering.h | 4 + .../Transforms/Utils/LowerVectorIntrinsics.h | 30 ++++++++ llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 14 ++++ llvm/lib/CodeGen/TargetLoweringBase.cpp 
| 11 +++ llvm/lib/Transforms/Utils/CMakeLists.txt | 1 + .../Utils/LowerVectorIntrinsics.cpp | 73 +++++++++++++++++++ .../AArch64/expand-exp.ll | 43 +++++++++++ .../AArch64/lit.local.cfg | 2 + .../llvm/lib/Transforms/Utils/BUILD.gn | 1 + 9 files changed, 179 insertions(+) create mode 100644 llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h create mode 100644 llvm/lib/Transforms/Utils/LowerVectorIntrinsics.cpp create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/lit.local.cfg diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 59743dbe4d2ea..861cffdc115a4 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2123,6 +2123,10 @@ class TargetLoweringBase { /// Get the ISD node that corresponds to the Instruction class opcode. int InstructionOpcodeToISD(unsigned Opcode) const; + /// Get the ISD node that corresponds to the Intrinsic ID. Returns + /// ISD::DELETED_NODE by default for an unsupported Intrinsic ID. + int IntrinsicIDToISD(Intrinsic::ID ID) const; + /// @} //===--------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h new file mode 100644 index 0000000000000..cb48bb01e178a --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h @@ -0,0 +1,30 @@ +//===- llvm/Transforms/Utils/LowerVectorIntrinsics.h ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower intrinsics with a scalable vector arg to loops. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_LOWERVECTORINTRINSICS_H +#define LLVM_TRANSFORMS_UTILS_LOWERVECTORINTRINSICS_H + +#include +#include + +namespace llvm { + +class CallInst; +class Module; + +/// Lower \p CI as a loop. \p CI is a unary intrinsic with a vector argument and +/// is deleted and replaced with a loop. 
+bool lowerUnaryVectorIntrinsicAsLoop(Module &M, CallInst *CI); + +} // namespace llvm + +#endif diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 4a3d1673c2a7c..048a6a49e4cb9 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" +#include "llvm/Transforms/Utils/LowerVectorIntrinsics.h" using namespace llvm; @@ -453,6 +454,19 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { case Intrinsic::objc_sync_exit: Changed |= lowerObjCCall(F, "objc_sync_exit"); break; + case Intrinsic::exp: + case Intrinsic::exp2: + Changed |= forEachCall(F, [&](CallInst *CI) { + Type *Ty = CI->getArgOperand(0)->getType(); + if (!isa(Ty)) + return false; + const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering(); + unsigned Op = TL->IntrinsicIDToISD(F.getIntrinsicID()); + if (!TL->isOperationExpand(Op, EVT::getEVT(Ty))) + return false; + return lowerUnaryVectorIntrinsicAsLoop(M, CI); + }); + break; } } return Changed; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 73af0a9a71407..9c56912aa6ba0 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1841,6 +1841,17 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { llvm_unreachable("Unknown instruction type encountered!"); } +int TargetLoweringBase::IntrinsicIDToISD(Intrinsic::ID ID) const { + switch (ID) { + case Intrinsic::exp: + return ISD::FEXP; + case Intrinsic::exp2: + return ISD::FEXP2; + default: + return ISD::DELETED_NODE; + } +} + Value * TargetLoweringBase::getDefaultSafeStackPointerLocation(IRBuilderBase &IRB, bool UseTLS) const { diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index 65bd3080662c4..78cad0d253be8 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -56,6 +56,7 @@ add_llvm_component_library(LLVMTransformUtils LowerInvoke.cpp LowerMemIntrinsics.cpp LowerSwitch.cpp + LowerVectorIntrinsics.cpp MatrixUtils.cpp MemoryOpRemark.cpp MemoryTaggingSupport.cpp diff --git a/llvm/lib/Transforms/Utils/LowerVectorIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerVectorIntrinsics.cpp new file mode 100644 index 0000000000000..cd716deec14f5 --- /dev/null +++ b/llvm/lib/Transforms/Utils/LowerVectorIntrinsics.cpp @@ -0,0 +1,73 @@ +//===- LowerVectorIntrinsics.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/LowerVectorIntrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "lower-vector-intrinsics" + +using namespace llvm; + +bool llvm::lowerUnaryVectorIntrinsicAsLoop(Module &M, CallInst *CI) { + Type *ArgTy = CI->getArgOperand(0)->getType(); + VectorType *VecTy = cast(ArgTy); + + BasicBlock *PreLoopBB = CI->getParent(); + BasicBlock *PostLoopBB = nullptr; + Function *ParentFunc = PreLoopBB->getParent(); + LLVMContext &Ctx = PreLoopBB->getContext(); + + PostLoopBB = PreLoopBB->splitBasicBlock(CI); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "", ParentFunc, PostLoopBB); + PreLoopBB->getTerminator()->setSuccessor(0, LoopBB); + + // Loop preheader + IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator()); + Value *LoopEnd = nullptr; + if (auto *ScalableVecTy = dyn_cast(VecTy)) { + Value *VScale = PreLoopBuilder.CreateVScale( + ConstantInt::get(PreLoopBuilder.getInt64Ty(), 1)); + Value *N = ConstantInt::get(PreLoopBuilder.getInt64Ty(), + ScalableVecTy->getMinNumElements()); + LoopEnd = PreLoopBuilder.CreateMul(VScale, N); + } else { + FixedVectorType *FixedVecTy = cast(VecTy); + LoopEnd = ConstantInt::get(PreLoopBuilder.getInt64Ty(), + FixedVecTy->getNumElements()); + } + + // Loop body + IRBuilder<> LoopBuilder(LoopBB); + Type *Int64Ty = LoopBuilder.getInt64Ty(); + + PHINode *LoopIndex = LoopBuilder.CreatePHI(Int64Ty, 2); + LoopIndex->addIncoming(ConstantInt::get(Int64Ty, 0U), PreLoopBB); + PHINode *Vec = LoopBuilder.CreatePHI(VecTy, 2); + Vec->addIncoming(CI->getArgOperand(0), PreLoopBB); + + Value *Elem = LoopBuilder.CreateExtractElement(Vec, LoopIndex); + Function *Exp = Intrinsic::getOrInsertDeclaration(&M, CI->getIntrinsicID(), + VecTy->getElementType()); + Value *Res = LoopBuilder.CreateCall(Exp, Elem); + Value *NewVec = LoopBuilder.CreateInsertElement(Vec, Res, LoopIndex); + Vec->addIncoming(NewVec, LoopBB); + + Value *One = ConstantInt::get(Int64Ty, 1U); + Value *NextLoopIndex = LoopBuilder.CreateAdd(LoopIndex, One); + LoopIndex->addIncoming(NextLoopIndex, LoopBB); + + Value *ExitCond = + LoopBuilder.CreateICmp(CmpInst::ICMP_EQ, NextLoopIndex, LoopEnd); + LoopBuilder.CreateCondBr(ExitCond, PostLoopBB, LoopBB); + + CI->replaceAllUsesWith(NewVec); + CI->eraseFromParent(); + return true; +} diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll new file mode 100644 index 0000000000000..284f2ad8072fc --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=pre-isel-intrinsic-lowering -S < %s | FileCheck %s +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64" + +define @scalable_vec_exp( %input) { +; CHECK-LABEL: define @scalable_vec_exp( +; CHECK-SAME: [[INPUT:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-NEXT: br label %[[BB3:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP9:%.*]], %[[BB3]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi [ [[INPUT]], [[TMP0]] ], [ [[TMP8:%.*]], %[[BB3]] ] +; CHECK-NEXT: 
[[TMP6:%.*]] = extractelement [[TMP5]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.exp.f32(float [[TMP6]]) +; CHECK-NEXT: [[TMP8]] = insertelement [[TMP5]], float [[TMP7]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP9]] = add i64 [[TMP4]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[BB11:.*]], label %[[BB3]] +; CHECK: [[BB11]]: +; CHECK-NEXT: ret [[TMP8]] +; + %output = call @llvm.exp.nxv4f32( %input) + ret %output +} + +define <4 x float> @fixed_vec_exp(<4 x float> %input) { +; CHECK-LABEL: define <4 x float> @fixed_vec_exp( +; CHECK-SAME: <4 x float> [[INPUT:%.*]]) { +; CHECK-NEXT: [[OUTPUT:%.*]] = call <4 x float> @llvm.exp.v4f32(<4 x float> [[INPUT]]) +; CHECK-NEXT: ret <4 x float> [[OUTPUT]] +; + %output = call <4 x float> @llvm.exp.v4f32(<4 x float> %input) + ret <4 x float> %output +} + +declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0 +declare @llvm.exp.nxv4f32() #0 + +; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK-NEXT: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/lit.local.cfg b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/lit.local.cfg new file mode 100644 index 0000000000000..10d4a0e953ed4 --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AArch64" in config.root.targets: + config.unsupported = True diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn index 1479e1c355d95..b16fe19bddfd1 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn @@ -64,6 +64,7 @@ static_library("Utils") { "LowerInvoke.cpp", "LowerMemIntrinsics.cpp", "LowerSwitch.cpp", + "LowerVectorIntrinsics.cpp", "MatrixUtils.cpp", "Mem2Reg.cpp", "MemoryOpRemark.cpp", From 83df39c649fe1b1dd556d8f2160999c65ce497eb Mon Sep 17 00:00:00 2001 From: junfengd-nv Date: Fri, 24 Jan 2025 11:06:37 -0800 Subject: [PATCH 041/432] [mlir][inline] Fix Issue#82401: Infinite loop in MLIR inliner for indirect recursive call. (#124026) --- mlir/lib/Transforms/Utils/Inliner.cpp | 4 +- .../test/Transforms/inlining-recursive-2.mlir | 37 +++++++++++++++++++ mlir/test/Transforms/inlining-recursive.mlir | 2 +- 3 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 mlir/test/Transforms/inlining-recursive-2.mlir diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp index 8acfc96d2b611..756f5e379e7dd 100644 --- a/mlir/lib/Transforms/Utils/Inliner.cpp +++ b/mlir/lib/Transforms/Utils/Inliner.cpp @@ -713,9 +713,11 @@ bool Inliner::Impl::shouldInline(ResolvedCall &resolvedCall) { return false; // Don't allow inlining if the target is a self-recursive function. + // Don't allow inlining if the call graph is like A->B->A. 
if (llvm::count_if(*resolvedCall.targetNode, [&](CallGraphNode::Edge const &edge) -> bool { - return edge.getTarget() == resolvedCall.targetNode; + return edge.getTarget() == resolvedCall.targetNode || + edge.getTarget() == resolvedCall.sourceNode; }) > 0) return false; diff --git a/mlir/test/Transforms/inlining-recursive-2.mlir b/mlir/test/Transforms/inlining-recursive-2.mlir new file mode 100644 index 0000000000000..e50cf9695c4a3 --- /dev/null +++ b/mlir/test/Transforms/inlining-recursive-2.mlir @@ -0,0 +1,37 @@ +// RUN: mlir-opt %s -inline='default-pipeline=' | FileCheck %s +// RUN: mlir-opt %s --mlir-disable-threading -inline='default-pipeline=' | FileCheck %s + +module { + // CHECK-LABEL: func.func @parent1 + func.func @parent1(%arg0: i32) -> i32 { + // CHECK: call @child + %0 = call @child(%arg0) : (i32) -> i32 + return %0 : i32 + } + + // CHECK-LABEL: func.func @parent2 + func.func @parent2(%arg0: i32) -> i32 { + // CHECK: call @child + %0 = call @child(%arg0) : (i32) -> i32 + return %0 : i32 + } + + // CHECK-LABEL: func.func @child + func.func @child(%arg0: i32) -> i32 { + %c10_i32 = arith.constant 10 : i32 + %c1_i32 = arith.constant 1 : i32 + %0 = arith.cmpi sge, %arg0, %c10_i32 : i32 + %1 = scf.if %0 -> (i32) { + scf.yield %arg0 : i32 + } else { + %2 = arith.addi %arg0, %c1_i32 : i32 + // CHECK: call @parent1 + // CHECK: call @parent2 + %3 = func.call @parent1(%2) : (i32) -> i32 + %4 = func.call @parent2(%2) : (i32) -> i32 + %5 = arith.addi %3, %4 : i32 + scf.yield %5 : i32 + } + return %1 : i32 + } +} diff --git a/mlir/test/Transforms/inlining-recursive.mlir b/mlir/test/Transforms/inlining-recursive.mlir index 403accd8b7ee8..f953935475e1a 100644 --- a/mlir/test/Transforms/inlining-recursive.mlir +++ b/mlir/test/Transforms/inlining-recursive.mlir @@ -17,7 +17,7 @@ func.func @foo0(%arg0 : i32) -> i32 { // CHECK-LABEL: func.func @foo1 func.func @foo1(%arg0 : i32) -> i32 { - // CHECK: call @foo1 + // CHECK: call @foo0 %0 = arith.constant 1 : i32 %1 = arith.subi %arg0, %0 : i32 %2 = call @foo0(%1) : (i32) -> i32 From 3b30f20c60d020e43f5700dae68cf1080158b725 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 24 Jan 2025 20:23:18 +0100 Subject: [PATCH 042/432] [libc++][TZDB] Fixes CI. The commit 24e70e3930724ce499ad05d669bfbc4423c542e0 changed internal macros which were used in 0cd794d4860e376698bb4da24bcdf8cbf331835c. 
This caused build failures on platforms without TZDB support --- libcxx/include/__chrono/convert_to_tm.h | 8 ++++---- libcxx/include/__chrono/formatter.h | 4 ++-- libcxx/include/__chrono/ostream.h | 4 ++-- libcxx/include/__chrono/utc_clock.h | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/libcxx/include/__chrono/convert_to_tm.h b/libcxx/include/__chrono/convert_to_tm.h index e547e107a5852..7d06a38d87f26 100644 --- a/libcxx/include/__chrono/convert_to_tm.h +++ b/libcxx/include/__chrono/convert_to_tm.h @@ -100,7 +100,7 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(const chrono::sys_time<_Duration> __tp } # if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION -# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# if _LIBCPP_HAS_EXPERIMENTAL_TZDB template _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(chrono::utc_time<_Duration> __tp) { @@ -112,7 +112,7 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(chrono::utc_time<_Duration> __tp) { return __result; } -# endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# endif // _LIBCPP_HAS_EXPERIMENTAL_TZDB # endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION // Convert a chrono (calendar) time point, or dururation to the given _Tm type, @@ -128,10 +128,10 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(const _ChronoT& __value) { if constexpr (same_as) return std::__convert_to_tm<_Tm>(__value); # if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION -# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# if _LIBCPP_HAS_EXPERIMENTAL_TZDB else if constexpr (same_as) return std::__convert_to_tm<_Tm>(__value); -# endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# endif // _LIBCPP_HAS_EXPERIMENTAL_TZDB # endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION else if constexpr (same_as) return std::__convert_to_tm<_Tm>(_ChronoT::clock::to_sys(__value)); diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h index 6153fdc35a47b..d17acd274e4cd 100644 --- a/libcxx/include/__chrono/formatter.h +++ b/libcxx/include/__chrono/formatter.h @@ -721,7 +721,7 @@ struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : pub }; # if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM -# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# if _LIBCPP_HAS_EXPERIMENTAL_TZDB template struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_chrono<_CharT> { @@ -734,7 +734,7 @@ struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : pub } }; -# endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# endif // _LIBCPP_HAS_EXPERIMENTAL_TZDB # endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM template diff --git a/libcxx/include/__chrono/ostream.h b/libcxx/include/__chrono/ostream.h index 66735e5c2c28b..ed9ad8e346ba9 100644 --- a/libcxx/include/__chrono/ostream.h +++ b/libcxx/include/__chrono/ostream.h @@ -63,7 +63,7 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const sys_days& __dp) { } # if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM -# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# if _LIBCPP_HAS_EXPERIMENTAL_TZDB template _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& @@ -71,7 +71,7 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const utc_time<_Duration>& __tp return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L%F %T}"), __tp); } -# endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# endif // 
_LIBCPP_HAS_EXPERIMENTAL_TZDB # endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM template diff --git a/libcxx/include/__chrono/utc_clock.h b/libcxx/include/__chrono/utc_clock.h index 647b6eda13ea2..2207b89c92c59 100644 --- a/libcxx/include/__chrono/utc_clock.h +++ b/libcxx/include/__chrono/utc_clock.h @@ -12,7 +12,7 @@ #include // Enable the contents of the header only when libc++ was built with experimental features enabled. -#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +#if _LIBCPP_HAS_EXPERIMENTAL_TZDB # include <__chrono/duration.h> # include <__chrono/leap_second.h> @@ -158,6 +158,6 @@ utc_clock::to_sys(const utc_time<_Duration>& __time) { _LIBCPP_END_NAMESPACE_STD -#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +#endif // _LIBCPP_HAS_EXPERIMENTAL_TZDB #endif // _LIBCPP___CHRONO_UTC_CLOCK_H From 95d993a838863269dc1b90de3808c1e40ac6d5f2 Mon Sep 17 00:00:00 2001 From: Henrich Lauko Date: Fri, 24 Jan 2025 20:28:36 +0100 Subject: [PATCH 043/432] [MLIR] Fix import of calls with mismatched variadic types (#124286) Previously, an indirect call was incorrectly generated when `llvm::CallBase::getCalledFunction` returned null due to a type mismatch between the call and the function. This patch updates the code to use `llvm::CallBase::getCalledOperand` instead. --- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 77 +++++++++++-------- .../test/Target/LLVMIR/Import/instructions.ll | 25 ++++++ 2 files changed, 70 insertions(+), 32 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index f6826a2362bfd..40d86efe605ad 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1495,15 +1495,22 @@ LogicalResult ModuleImport::convertCallTypeAndOperands( if (!callInst->getType()->isVoidTy()) types.push_back(convertType(callInst->getType())); - if (!callInst->getCalledFunction()) { - if (!allowInlineAsm || - !isa(callInst->getCalledOperand())) { - FailureOr called = convertValue(callInst->getCalledOperand()); - if (failed(called)) - return failure(); - operands.push_back(*called); - } + bool isInlineAsm = callInst->isInlineAsm(); + if (isInlineAsm && !allowInlineAsm) + return failure(); + + // Cannot use isIndirectCall() here because we need to handle Constant callees + // that are not considered indirect calls by LLVM. However, in MLIR, they are + // treated as indirect calls to constant operands that need to be converted. + // Skip the callee operand if it's inline assembly, as it's handled separately + // in InlineAsmOp. 
+ if (!isa(callInst->getCalledOperand()) && !isInlineAsm) { + FailureOr called = convertValue(callInst->getCalledOperand()); + if (failed(called)) + return failure(); + operands.push_back(*called); } + SmallVector args(callInst->args()); FailureOr> arguments = convertValues(args); if (failed(arguments)) @@ -1593,7 +1600,8 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) { return success(); } if (inst->getOpcode() == llvm::Instruction::Call) { - auto *callInst = cast(inst); + auto callInst = cast(inst); + llvm::Value *calledOperand = callInst->getCalledOperand(); SmallVector types; SmallVector operands; @@ -1601,15 +1609,12 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) { /*allowInlineAsm=*/true))) return failure(); - auto funcTy = - dyn_cast(convertType(callInst->getFunctionType())); - if (!funcTy) - return failure(); - - if (auto asmI = dyn_cast(callInst->getCalledOperand())) { + if (auto asmI = dyn_cast(calledOperand)) { + Type resultTy = convertType(callInst->getType()); + if (!resultTy) + return failure(); auto callOp = builder.create( - loc, funcTy.getReturnType(), operands, - builder.getStringAttr(asmI->getAsmString()), + loc, resultTy, operands, builder.getStringAttr(asmI->getAsmString()), builder.getStringAttr(asmI->getConstraintString()), /*has_side_effects=*/true, /*is_align_stack=*/false, /*asm_dialect=*/nullptr, @@ -1619,27 +1624,35 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) { else mapNoResultOp(inst, callOp); } else { - CallOp callOp; + auto funcTy = dyn_cast([&]() -> Type { + // Retrieve the real function type. For direct calls, use the callee's + // function type, as it may differ from the operand type in the case of + // variadic functions. For indirect calls, use the call function type. + if (auto callee = dyn_cast(calledOperand)) + return convertType(callee->getFunctionType()); + return convertType(callInst->getFunctionType()); + }()); + + if (!funcTy) + return failure(); - if (llvm::Function *callee = callInst->getCalledFunction()) { - callOp = builder.create( - loc, funcTy, SymbolRefAttr::get(context, callee->getName()), - operands); - } else { - callOp = builder.create(loc, funcTy, operands); - } + auto callOp = [&]() -> CallOp { + if (auto callee = dyn_cast(calledOperand)) { + auto name = SymbolRefAttr::get(context, callee->getName()); + return builder.create(loc, funcTy, name, operands); + } + return builder.create(loc, funcTy, operands); + }(); + + // Handle function attributes. callOp.setCConv(convertCConvFromLLVM(callInst->getCallingConv())); callOp.setTailCallKind( convertTailCallKindFromLLVM(callInst->getTailCallKind())); setFastmathFlagsAttr(inst, callOp); - // Handle function attributes. 
- if (callInst->hasFnAttr(llvm::Attribute::Convergent)) - callOp.setConvergent(true); - if (callInst->hasFnAttr(llvm::Attribute::NoUnwind)) - callOp.setNoUnwind(true); - if (callInst->hasFnAttr(llvm::Attribute::WillReturn)) - callOp.setWillReturn(true); + callOp.setConvergent(callInst->isConvergent()); + callOp.setNoUnwind(callInst->doesNotThrow()); + callOp.setWillReturn(callInst->hasFnAttr(llvm::Attribute::WillReturn)); llvm::MemoryEffects memEffects = callInst->getMemoryEffects(); ModRefInfo othermem = convertModRefInfoFromLLVM( diff --git a/mlir/test/Target/LLVMIR/Import/instructions.ll b/mlir/test/Target/LLVMIR/Import/instructions.ll index 7377e2584110b..77052ab6e41f6 100644 --- a/mlir/test/Target/LLVMIR/Import/instructions.ll +++ b/mlir/test/Target/LLVMIR/Import/instructions.ll @@ -570,6 +570,31 @@ define void @varargs_call(i32 %0) { ; // ----- +; CHECK: @varargs(...) +declare void @varargs(...) + +; CHECK-LABEL: @varargs_call +; CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +define void @varargs_call(i32 %0) { + ; CHECK: llvm.call @varargs(%[[ARG1]]) vararg(!llvm.func) : (i32) -> () + call void @varargs(i32 %0) + ret void +} + +; // ----- + +; CHECK: @varargs(...) +declare void @varargs(...) + +; CHECK-LABEL: @empty_varargs_call +define void @empty_varargs_call() { + ; CHECK: llvm.call @varargs() vararg(!llvm.func) : () -> () + call void @varargs() + ret void +} + +; // ----- + ; CHECK: llvm.func @f() declare void @f() From 1b1270f30bbdb2c7a310009d0512e167b09bac48 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Fri, 24 Jan 2025 19:48:40 +0000 Subject: [PATCH 044/432] [FMV][GlobalOpt] Enable static resolution of non-FMV callers. (#124314) The undetectable FMV features predres and ls64 have been removed, therefore the optimization is now re-enabled. The llvm testsuite Graviton4 bots are expected to remain green. --- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 11 +---------- llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 2 +- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 00c20ad5f3709..9586fc97a39f7 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -95,20 +95,11 @@ STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed"); STATISTIC(NumGlobalArraysPadded, "Number of global arrays padded to alignment boundary"); -// FIXME: -// Optimizing non-FMV callers is causing a regression in the llvm test suite, -// specifically a 'predres' version is unexpectedly trapping on GravitonG4. -// My explanation is that when the caller in not a versioned function, the -// compiler exclusively relies on the command line option, or target attribute -// to deduce whether a feature is available. However, there is no guarantee -// that in reality the host supports those implied features, which arguably -// is a user error. This option allows disabling the optimization as a short -// term workaround to keep the bots green. 
static cl::opt OptimizeNonFMVCallers("optimize-non-fmv-callers", cl::desc("Statically resolve calls to versioned " "functions from non-versioned callers."), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt EnableColdCCStressTest("enable-coldcc-stress-test", diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index fa817a8cbf417..4b6a19d3f05cf 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -221,7 +221,7 @@ resolver_entry: define i32 @caller4() #8 { ; CHECK-LABEL: define i32 @caller4( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { -; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller() +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes() ; entry: %call = tail call i32 @test_non_fmv_caller() From d398c0c97aa0bfaeed5647f75bc37c87b8142f79 Mon Sep 17 00:00:00 2001 From: siya100 <85541510+siya100@users.noreply.github.com> Date: Sat, 25 Jan 2025 02:02:51 +0530 Subject: [PATCH 045/432] [libc][cpio] Add cpio.h header. (#123798) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [libc][docs] add cpio to documentation and include related functi… These changes ensure that the cpio header is documented properly with respect to the issue (https://github.com/llvm/llvm-project/issues/122006 ). **Changes:** 1. **cpio.yaml**: Created a new YAML file for cpio with functions and related macros. 2. **CMakeLists.txt**: Added cpio to the documentation directories. 3. **index.rst**: Included `cpio` in the documentation index. --------- Co-authored-by: siya --- libc/docs/CMakeLists.txt | 1 + libc/docs/headers/index.rst | 1 + libc/utils/docgen/cpio.yaml | 44 +++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+) create mode 100644 libc/utils/docgen/cpio.yaml diff --git a/libc/docs/CMakeLists.txt b/libc/docs/CMakeLists.txt index bb8e3e96e47ca..fc5e505c3be69 100644 --- a/libc/docs/CMakeLists.txt +++ b/libc/docs/CMakeLists.txt @@ -37,6 +37,7 @@ if (SPHINX_FOUND) aio arpa/inet assert + cpio ctype errno fenv diff --git a/libc/docs/headers/index.rst b/libc/docs/headers/index.rst index d08552d223252..bd48dd5989bcd 100644 --- a/libc/docs/headers/index.rst +++ b/libc/docs/headers/index.rst @@ -8,6 +8,7 @@ Implementation Status arpa/inet assert complex + cpio ctype errno fenv diff --git a/libc/utils/docgen/cpio.yaml b/libc/utils/docgen/cpio.yaml new file mode 100644 index 0000000000000..b31c03778fba5 --- /dev/null +++ b/libc/utils/docgen/cpio.yaml @@ -0,0 +1,44 @@ +macros: + C_IRUSR: + in-latest-posix: '' + C_IWUSR: + in-latest-posix: '' + C_IXUSR: + in-latest-posix: '' + C_IRGRP: + in-latest-posix: '' + C_IWGRP: + in-latest-posix: '' + C_IXGRP: + in-latest-posix: '' + C_IROTH: + in-latest-posix: '' + C_IWOTH: + in-latest-posix: '' + C_IXOTH: + in-latest-posix: '' + C_ISUID: + in-latest-posix: '' + C_ISGID: + in-latest-posix: '' + C_ISVTX: + in-latest-posix: '' + C_ISDIR: + in-latest-posix: '' + C_ISFIFO: + in-latest-posix: '' + C_ISREG: + in-latest-posix: '' + C_ISBLK: + in-latest-posix: '' + C_ISCHR: + in-latest-posix: '' + C_ISCTG: + in-latest-posix: '' + C_ISLNK: + in-latest-posix: '' + C_ISSOCK: + in-latest-posix: '' + MAGIC: + in-latest-posix: '' + From 074a25fb2678dacb4f3c6a24d5f907788c858e7a Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 24 Jan 2025 13:02:33 -0800 Subject: [PATCH 046/432] [RISCV][MC] Create an AsmOperand for carry-in vmask (#124317) 
Previously we used a fixed assembly string as well as a fixed encoding for
the carry-in vector mask, since it will always be there. However, this makes
both the AsmParser and the disassembler either create a garbage MCOperand
for the mask or fail to add one altogether. This wouldn't be a problem for
the majority of cases, but tools like llvm-mca, which rely on MCInst, will
fail to account for the register dependency on these mask operands.
---
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 4 ++
 .../RISCV/Disassembler/RISCVDisassembler.cpp | 10 +++
 llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 35 ++++++----
 .../MC/Disassembler/RISCV/vmask-carry-in.txt | 69 +++++++++++++++++++
 llvm/test/MC/RISCV/rvv/vmask-carry-in.s | 69 +++++++++++++++++++
 5 files changed, 172 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/MC/Disassembler/RISCV/vmask-carry-in.txt
 create mode 100644 llvm/test/MC/RISCV/rvv/vmask-carry-in.s

diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 227a6361730da..2e86a891863fd 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -1681,6 +1681,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
     return Error(ErrorLoc, "operand must be v0.t");
   }
+  case Match_InvalidVMaskCarryInRegister: {
+    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+    return Error(ErrorLoc, "operand must be v0");
+  }
   case Match_InvalidSImm5Plus1: {
     return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 4) + 1,
                                       (1 << 4),
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index a0b87f7c7ff25..1c4f322e2104e 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -297,6 +297,16 @@ static DecodeStatus DecodeVRM8RegisterClass(MCInst &Inst, uint32_t RegNo,
   return MCDisassembler::Success;
 }
+static DecodeStatus DecodeVMV0RegisterClass(MCInst &Inst, uint32_t RegNo,
+                                            uint64_t Address,
+                                            const MCDisassembler *Decoder) {
+  if (RegNo)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(RISCV::V0));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus decodeVMaskReg(MCInst &Inst, uint32_t RegNo,
                                    uint64_t Address,
                                    const MCDisassembler *Decoder) {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 24a881dc6810f..671e493fb3763 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -50,6 +50,13 @@ def VMaskAsmOperand : AsmOperandClass {
   let DiagnosticType = "InvalidVMaskRegister";
 }
+def VMaskCarryInAsmOperand : AsmOperandClass {
+  let Name = "RVVMaskCarryInRegOpOperand";
+  let RenderMethod = "addRegOperands";
+  let PredicateMethod = "isV0Reg";
+  let DiagnosticType = "InvalidVMaskCarryInRegister";
+}
+
 def VMaskOp : RegisterOperand {
   let ParserMatchClass = VMaskAsmOperand;
   let PrintMethod = "printVMaskReg";
@@ -57,6 +64,11 @@ def VMaskOp : RegisterOperand {
   let DecoderMethod = "decodeVMaskReg";
 }
+def VMaskCarryInOp : RegisterOperand {
+  let ParserMatchClass = VMaskCarryInAsmOperand;
+  let EncoderMethod = "getVMaskReg";
+}
+
 def simm5 : RISCVSImmLeafOp<5> {
   let MCOperandPredicate = [{
     int64_t Imm;
@@ -442,10 +454,8 @@ class VALUVV funct6, RISCVVFormat opv, string opcodestr>
 // op vd, vs2,
vs1, v0 (without mask, use v0 as carry input)
 class VALUmVV funct6, RISCVVFormat opv, string opcodestr>
     : RVInstVV {
-  let vm = 0;
-}
+                (ins VR:$vs2, VR:$vs1, VMaskCarryInOp:$vm),
+                opcodestr, "$vd, $vs2, $vs1, $vm">;
 // op vd, vs1, vs2, vm (reverse the order of vs1 and vs2)
 class VALUrVV funct6, RISCVVFormat opv, string opcodestr,
@@ -474,10 +484,8 @@ class VALUVX funct6, RISCVVFormat opv, string opcodestr>
 // op vd, vs2, rs1, v0 (without mask, use v0 as carry input)
 class VALUmVX funct6, RISCVVFormat opv, string opcodestr>
     : RVInstVX {
-  let vm = 0;
-}
+               (ins VR:$vs2, GPR:$rs1, VMaskCarryInOp:$vm),
+               opcodestr, "$vd, $vs2, $rs1, $vm">;
 // op vd, rs1, vs2, vm (reverse the order of rs1 and vs2)
 class VALUrVX funct6, RISCVVFormat opv, string opcodestr,
@@ -506,10 +514,8 @@ class VALUVI funct6, string opcodestr, Operand optype = simm5>
 // op vd, vs2, imm, v0 (without mask, use v0 as carry input)
 class VALUmVI funct6, string opcodestr, Operand optype = simm5>
     : RVInstIVI {
-  let vm = 0;
-}
+               (ins VR:$vs2, optype:$imm, VMaskCarryInOp:$vm),
+               opcodestr, "$vd, $vs2, $imm, $vm">;
 // op vd, vs2, imm, vm
 class VALUVINoVm funct6, string opcodestr, Operand optype = simm5>
@@ -1458,10 +1464,9 @@ defm VFCLASS_V : VCLS_FV_VS2<"vfclass.v", 0b010011, 0b10000>;
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
 // Vector Floating-Point Merge Instruction
-let vm = 0 in
 def VFMERGE_VFM : RVInstVX<0b010111, OPFVF, (outs VR:$vd),
-                           (ins VR:$vs2, FPR32:$rs1, VMV0:$v0),
-                           "vfmerge.vfm", "$vd, $vs2, $rs1, v0">,
+                           (ins VR:$vs2, FPR32:$rs1, VMaskCarryInOp:$vm),
+                           "vfmerge.vfm", "$vd, $vs2, $rs1, $vm">,
                  SchedBinaryMC<"WriteVFMergeV", "ReadVFMergeV", "ReadVFMergeF">;
 // Vector Floating-Point Move Instruction
diff --git a/llvm/test/MC/Disassembler/RISCV/vmask-carry-in.txt b/llvm/test/MC/Disassembler/RISCV/vmask-carry-in.txt
new file mode 100644
index 0000000000000..e9af01ac60b43
--- /dev/null
+++ b/llvm/test/MC/Disassembler/RISCV/vmask-carry-in.txt
@@ -0,0 +1,69 @@
+# RUN: llvm-mc -triple=riscv64 -disassemble -show-inst --mattr=+v %s \
+# RUN:     --M no-aliases | FileCheck %s
+
+# Check if there is a MCOperand for the carry-in mask.
+
+[0x57,0x04,0x4a,0x5c]
+# CHECK: 
Date: Fri, 24 Jan 2025 13:06:11 -0800
Subject: [PATCH 047/432] [MemProf] Disable hot hints by default (#124338)

By default we were marking some contexts as hot, and adding hot hints to
unambiguously hot allocations. However, there is not yet support for
cloning to expose hot allocation contexts, and none is planned for the
foreseeable future. While we convert hot contexts to notcold contexts
during the cloning step, their existence was greatly limiting the
context trimming performed when we add the MemProf profile to the IR.

This change simply disables the generation of hot contexts / hints by
default, as few allocations were unambiguously hot. A subsequent change
will address the issue when hot hints are optionally enabled. See
PR124219 for details.
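For intuition, a minimal sketch of the gating this change adds (the
identifiers come from the MemoryProfileInfo.cpp diff below; the numeric
example is hypothetical):

  // TotalLifetimeAccessDensity is stored scaled by 100 to hold two decimal
  // places of precision, hence the division by 100. With, say,
  // TotalLifetimeAccessDensity = 400000 and AllocCount = 4, the average
  // density is 400000 / 4 / 100 = 1000 accesses per byte per second; even
  // when that exceeds MemProfMinAveLifetimeAccessDensityHotThreshold, the
  // allocation is now classified NotCold unless MemProfUseHotHints is set.
  if (MemProfUseHotHints &&
      ((float)TotalLifetimeAccessDensity) / AllocCount / 100 >
          MemProfMinAveLifetimeAccessDensityHotThreshold)
    return AllocationType::Hot;
  return AllocationType::NotCold;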
This change resulted in significant overhead reductions for a large target: ~48% reduction in the per-module ThinLTO bitcode summary sizes ~72% reduction in the distributed ThinLTO bitcode combined summary sizes ~68% reduction in thin link time ~34% reduction in thin link peak memory --- llvm/lib/Analysis/MemoryProfileInfo.cpp | 10 ++++++++-- llvm/test/Transforms/PGOProfile/memprof.ll | 13 +++++++++++-- .../PGOProfile/memprof_loop_unroll.ll | 4 +++- .../Analysis/MemoryProfileInfoTest.cpp | 18 ++++++++++++++---- 4 files changed, 36 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 2f3c87a89f9f9..52f4adbdb0429 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -42,6 +42,11 @@ cl::opt MemProfMinAveLifetimeAccessDensityHotThreshold( cl::desc("The minimum TotalLifetimeAccessDensity / AllocCount for an " "allocation to be considered hot")); +cl::opt + MemProfUseHotHints("memprof-use-hot-hints", cl::init(false), cl::Hidden, + cl::desc("Enable use of hot hints (only supported for " + "unambigously hot allocations)")); + cl::opt MemProfReportHintedSizes( "memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations")); @@ -60,8 +65,9 @@ AllocationType llvm::memprof::getAllocType(uint64_t TotalLifetimeAccessDensity, // The access densities are multiplied by 100 to hold 2 decimal places of // precision, so need to divide by 100. - if (((float)TotalLifetimeAccessDensity) / AllocCount / 100 > - MemProfMinAveLifetimeAccessDensityHotThreshold) + if (MemProfUseHotHints && + ((float)TotalLifetimeAccessDensity) / AllocCount / 100 > + MemProfMinAveLifetimeAccessDensityHotThreshold) return AllocationType::Hot; return AllocationType::NotCold; diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll index c0e44cccbf16f..367069e993fe1 100644 --- a/llvm/test/Transforms/PGOProfile/memprof.ll +++ b/llvm/test/Transforms/PGOProfile/memprof.ll @@ -84,6 +84,8 @@ ; RUN: llvm-profdata merge -memprof-random-hotness -memprof-random-hotness-seed=1730170724 %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdatarand2 2>&1 | FileCheck %s --check-prefix=RAND2 ; RAND2: random hotness seed = 1730170724 ; RUN: opt < %s -passes='memprof-use' -pgo-warn-missing-function -S -stats 2>&1 | FileCheck %s --check-prefixes=MEMPROFRAND2,ALL,MEMPROFONLY,MEMPROFSTATS +;; Check with hot hints enabled +; RUN: opt < %s -memprof-use-hot-hints -passes='memprof-use' -pgo-warn-missing-function -S -stats 2>&1 | FileCheck %s --check-prefixes=MEMPROFRAND2HOT ; MEMPROFMATCHINFO: MemProf notcold context with id 1093248920606587996 has total profiled size 10 is matched ; MEMPROFMATCHINFO: MemProf notcold context with id 5725971306423925017 has total profiled size 10 is matched @@ -408,8 +410,15 @@ for.end: ; preds = %for.cond ; MEMPROFRAND2: !"cold" ; MEMPROFRAND2: !"cold" ; MEMPROFRAND2: !"cold" -; MEMPROFRAND2: !"hot" -; MEMPROFRAND2: !"hot" +; MEMPROFRAND2: !"notcold" +; MEMPROFRAND2: !"notcold" + +;; With hot hints enabled the last 2 should be hot. +; MEMPROFRAND2HOT: !"cold" +; MEMPROFRAND2HOT: !"cold" +; MEMPROFRAND2HOT: !"cold" +; MEMPROFRAND2HOT: !"hot" +; MEMPROFRAND2HOT: !"hot" ; MEMPROFSTATS: 8 memprof - Number of alloc contexts in memory profile. ; MEMPROFSTATS: 10 memprof - Number of callsites in memory profile. 
diff --git a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll index 9bc1282ab4529..2461ca32e9821 100644 --- a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll +++ b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll @@ -10,7 +10,9 @@ ;; $ clang++ -gmlt -fdebug-info-for-profiling -S %S/Inputs/memprof_loop_unroll_b.cc -emit-llvm ; RUN: llvm-profdata merge %S/Inputs/memprof_loop_unroll.memprofraw --profiled-binary %S/Inputs/memprof_loop_unroll.exe -o %t.memprofdata -; RUN: opt < %s -passes='memprof-use' -S -memprof-report-hinted-sizes 2>&1 | FileCheck %s +;; Set the minimum lifetime threshold to 0 to ensure that one context is +;; considered cold (the other will be notcold). +; RUN: opt < %s -passes='memprof-use' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s ;; Conservatively annotate as not cold. We get two messages as there are two ;; unrolled copies of the allocation. diff --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp index 4c177ae844690..3888faf5453d3 100644 --- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp +++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp @@ -25,6 +25,7 @@ using namespace llvm::memprof; extern cl::opt MemProfLifetimeAccessDensityColdThreshold; extern cl::opt MemProfAveLifetimeColdThreshold; extern cl::opt MemProfMinAveLifetimeAccessDensityHotThreshold; +extern cl::opt MemProfUseHotHints; namespace { @@ -81,14 +82,23 @@ TEST_F(MemoryProfileInfoTest, GetAllocType) { // MemProfMinAveLifetimeAccessDensityHotThreshold // so compute the HotTotalLifetimeAccessDensityThreshold at the threshold. const uint64_t HotTotalLifetimeAccessDensityThreshold = - (uint64_t)(MemProfMinAveLifetimeAccessDensityHotThreshold * AllocCount * 100); - - + (uint64_t)(MemProfMinAveLifetimeAccessDensityHotThreshold * AllocCount * + 100); + + // Make sure the option for detecting hot allocations is set. + MemProfUseHotHints = true; // Test Hot // More accesses per byte per sec than hot threshold is hot. EXPECT_EQ(getAllocType(HotTotalLifetimeAccessDensityThreshold + 1, AllocCount, ColdTotalLifetimeThreshold + 1), - AllocationType::Hot); + AllocationType::Hot); + // Undo the manual set of the option above. + cl::ResetAllOptionOccurrences(); + + // Without MemProfUseHotHints (default) we should treat simply as NotCold. + EXPECT_EQ(getAllocType(HotTotalLifetimeAccessDensityThreshold + 1, AllocCount, + ColdTotalLifetimeThreshold + 1), + AllocationType::NotCold); // Test Cold // Long lived with less accesses per byte per sec than cold threshold is cold. 
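(Usage note for the flag introduced in the patch above: a sketch mirroring
the RUN lines added to memprof.ll, with placeholder input and profile file
names:

  opt < memprof.ll -memprof-use-hot-hints \
      -passes='memprof-use<profile-filename=memprof.memprofdata>' -S

Without -memprof-use-hot-hints, the same unambiguously hot contexts come
back annotated notcold, which is what the updated MEMPROFRAND2 checks
expect.)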
From ee054404dfde9913ed47d9bac5ea2be28926f5ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 24 Jan 2025 13:09:58 -0800 Subject: [PATCH 048/432] [flang][cuda] Carry over the cuf.proc_attr attribute to gpu.launch_func (#124325) --- .../Optimizer/Transforms/CUFOpConversion.cpp | 5 +++++ flang/test/Fir/CUDA/cuda-launch.fir | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 7292ce741b85b..cc525d703ae57 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -810,6 +810,7 @@ struct CUFLaunchOpConversion rewriter.getContext(), op.getCallee().getLeafReference().getValue())}); mlir::Value clusterDimX, clusterDimY, clusterDimZ; + cuf::ProcAttributeAttr procAttr; if (auto funcOp = symTab.lookup( op.getCallee().getLeafReference())) { if (auto clusterDimsAttr = funcOp->getAttrOfType( @@ -821,6 +822,8 @@ struct CUFLaunchOpConversion clusterDimZ = rewriter.create( loc, clusterDimsAttr.getZ().getInt()); } + procAttr = + funcOp->getAttrOfType(cuf::getProcAttrName()); } llvm::SmallVector args; for (mlir::Value arg : op.getArgs()) { @@ -855,6 +858,8 @@ struct CUFLaunchOpConversion gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY); gpuLaunchOp.getClusterSizeZMutable().assign(clusterDimZ); } + if (procAttr) + gpuLaunchOp->setAttr(cuf::getProcAttrName(), procAttr); rewriter.replaceOp(op, gpuLaunchOp); return mlir::success(); } diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir index 8432b9ec926e3..7833fc7b490bf 100644 --- a/flang/test/Fir/CUDA/cuda-launch.fir +++ b/flang/test/Fir/CUDA/cuda-launch.fir @@ -104,3 +104,24 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e // CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_ADDR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr // CHECK: %[[CONV_DEVADDR:.*]] = fir.convert %[[DEVADDR]] : (!fir.llvm_ptr) -> !fir.ref>>> // CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref>>>) + +// ----- + +module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} { + gpu.module @cuda_device_mod { + gpu.func @_QMdevptrPtest() kernel { + gpu.return + } + } + func.func @_QMdevptrPtest() attributes {cuf.proc_attr = #cuf.cuda_proc} { + return + } + func.func @_QQmain() { + %c1_i32 = arith.constant 1 : i32 + cuf.kernel_launch @_QMdevptrPtest<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>() + return + } +} + +// CHECK-LABEL: func.func @_QQmain() +// CHECK: gpu.launch_func 
@cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} {cuf.proc_attr = #cuf.cuda_proc} From df9b31f1e0cdb8096e9d2e0749e473dd815b39f7 Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Fri, 24 Jan 2025 16:11:18 -0500 Subject: [PATCH 049/432] [clang][Sema] Handle undeduced auto types in HeuristicResolver (#124236) Fixes https://github.com/clangd/clangd/issues/897 --- clang/lib/Sema/HeuristicResolver.cpp | 17 +++++++- .../unittests/Sema/HeuristicResolverTest.cpp | 40 +++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/HeuristicResolver.cpp b/clang/lib/Sema/HeuristicResolver.cpp index 2a726fe51d355..e893afed71d26 100644 --- a/clang/lib/Sema/HeuristicResolver.cpp +++ b/clang/lib/Sema/HeuristicResolver.cpp @@ -227,6 +227,7 @@ std::vector HeuristicResolverImpl::resolveMemberExpr( } // Try resolving the member inside the expression's base type. + Expr *Base = ME->isImplicitAccess() ? nullptr : ME->getBase(); QualType BaseType = ME->getBaseType(); if (ME->isArrow()) { BaseType = getPointeeType(BaseType); @@ -237,11 +238,25 @@ std::vector HeuristicResolverImpl::resolveMemberExpr( // If BaseType is the type of a dependent expression, it's just // represented as BuiltinType::Dependent which gives us no information. We // can get further by analyzing the dependent expression. - Expr *Base = ME->isImplicitAccess() ? nullptr : ME->getBase(); if (Base && BT->getKind() == BuiltinType::Dependent) { BaseType = resolveExprToType(Base); } } + if (const auto *AT = BaseType->getContainedAutoType()) { + // If BaseType contains a dependent `auto` type, deduction will not have + // been performed on it yet. In simple cases (e.g. `auto` variable with + // initializer), get the approximate type that would result from deduction. + // FIXME: A more accurate implementation would propagate things like the + // `const` in `const auto`. 
+ if (AT->isUndeducedAutoType()) { + if (const auto *DRE = dyn_cast(Base)) { + if (const auto *VD = dyn_cast(DRE->getDecl())) { + if (VD->hasInit()) + BaseType = resolveExprToType(VD->getInit()); + } + } + } + } return resolveDependentMember(BaseType, ME->getMember(), NoFilter); } diff --git a/clang/unittests/Sema/HeuristicResolverTest.cpp b/clang/unittests/Sema/HeuristicResolverTest.cpp index 2cd5486b3227f..2b775b11719ea 100644 --- a/clang/unittests/Sema/HeuristicResolverTest.cpp +++ b/clang/unittests/Sema/HeuristicResolverTest.cpp @@ -155,6 +155,46 @@ TEST(HeuristicResolver, MemberExpr_SmartPointer_Qualified) { cxxMethodDecl(hasName("find"), isConst()).bind("output")); } +TEST(HeuristicResolver, MemberExpr_AutoTypeDeduction1) { + std::string Code = R"cpp( + template + struct A { + int waldo; + }; + template + void foo(A a) { + auto copy = a; + copy.waldo; + } + )cpp"; + expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("waldo")).bind("input"), + fieldDecl(hasName("waldo")).bind("output")); +} + +TEST(HeuristicResolver, MemberExpr_AutoTypeDeduction2) { + std::string Code = R"cpp( + struct B { + int waldo; + }; + + template + struct A { + B b; + }; + template + void foo(A a) { + auto b = a.b; + b.waldo; + } + )cpp"; + expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("waldo")).bind("input"), + fieldDecl(hasName("waldo")).bind("output")); +} + TEST(HeuristicResolver, MemberExpr_Chained) { std::string Code = R"cpp( struct A { void foo() {} }; From 73b462321c2968a450779f8f6c240f46a1830376 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Fri, 24 Jan 2025 13:39:20 -0800 Subject: [PATCH 050/432] [libc] Include size_t type header in strings.h (#124352) A number of functions in strings.h take size_t as an argument. --- libc/include/strings.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libc/include/strings.yaml b/libc/include/strings.yaml index e672dca6a94dd..b6aa8f6d60b27 100644 --- a/libc/include/strings.yaml +++ b/libc/include/strings.yaml @@ -1,7 +1,8 @@ header: strings.h header_template: strings.h.def macros: [] -types: [] +types: + - type_name: size_t enums: [] objects: [] functions: From b41987beaedaa6ea78fd8dd11ba8c3b21eb8fa88 Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 24 Jan 2025 13:59:32 -0800 Subject: [PATCH 051/432] [SandboxVec][DAG] Fix MemDGNode chain maintenance when move destination is non-mem (#124227) This patch fixes a bug in the maintenance of the MemDGNode chain of the DAG. Whenever we move a memory instruction, the DAG gets notified about the move and maintains the chain of memory nodes. The bug was that if the destination of the move was not a memory instruction, then the memory node's next node would end up pointing to itself. --- .../SandboxVectorizer/DependencyGraph.h | 16 +++-- .../SandboxVectorizer/DependencyGraph.cpp | 70 +++++++++++++------ .../SandboxVectorizer/DependencyGraphTest.cpp | 43 ++++++++++++ 3 files changed, 103 insertions(+), 26 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index b2d7c9b8aa8bb..6e3f99d78b932 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -218,12 +218,14 @@ class MemDGNode final : public DGNode { friend class PredIterator; // For MemPreds. 
/// Creates both edges: this<->N. void setNextNode(MemDGNode *N) { + assert(N != this && "About to point to self!"); NextMemN = N; if (NextMemN != nullptr) NextMemN->PrevMemN = this; } /// Creates both edges: N<->this. void setPrevNode(MemDGNode *N) { + assert(N != this && "About to point to self!"); PrevMemN = N; if (PrevMemN != nullptr) PrevMemN->NextMemN = this; @@ -348,13 +350,15 @@ class DependencyGraph { void createNewNodes(const Interval &NewInterval); /// Helper for `notify*Instr()`. \Returns the first MemDGNode that comes - /// before \p N, including or excluding \p N based on \p IncludingN, or - /// nullptr if not found. - MemDGNode *getMemDGNodeBefore(DGNode *N, bool IncludingN) const; + /// before \p N, skipping \p SkipN, including or excluding \p N based on + /// \p IncludingN, or nullptr if not found. + MemDGNode *getMemDGNodeBefore(DGNode *N, bool IncludingN, + MemDGNode *SkipN = nullptr) const; /// Helper for `notifyMoveInstr()`. \Returns the first MemDGNode that comes - /// after \p N, including or excluding \p N based on \p IncludingN, or nullptr - /// if not found. - MemDGNode *getMemDGNodeAfter(DGNode *N, bool IncludingN) const; + /// after \p N, skipping \p SkipN, including or excluding \p N based on \p + /// IncludingN, or nullptr if not found. + MemDGNode *getMemDGNodeAfter(DGNode *N, bool IncludingN, + MemDGNode *SkipN = nullptr) const; /// Called by the callbacks when a new instruction \p I has been created. void notifyCreateInstr(Instruction *I); diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index f080111f08d45..390a5e9688cc7 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -325,29 +325,31 @@ void DependencyGraph::createNewNodes(const Interval &NewInterval) { setDefUseUnscheduledSuccs(NewInterval); } -MemDGNode *DependencyGraph::getMemDGNodeBefore(DGNode *N, - bool IncludingN) const { +MemDGNode *DependencyGraph::getMemDGNodeBefore(DGNode *N, bool IncludingN, + MemDGNode *SkipN) const { auto *I = N->getInstruction(); for (auto *PrevI = IncludingN ? I : I->getPrevNode(); PrevI != nullptr; PrevI = PrevI->getPrevNode()) { auto *PrevN = getNodeOrNull(PrevI); if (PrevN == nullptr) return nullptr; - if (auto *PrevMemN = dyn_cast(PrevN)) + auto *PrevMemN = dyn_cast(PrevN); + if (PrevMemN != nullptr && PrevMemN != SkipN) return PrevMemN; } return nullptr; } -MemDGNode *DependencyGraph::getMemDGNodeAfter(DGNode *N, - bool IncludingN) const { +MemDGNode *DependencyGraph::getMemDGNodeAfter(DGNode *N, bool IncludingN, + MemDGNode *SkipN) const { auto *I = N->getInstruction(); for (auto *NextI = IncludingN ? I : I->getNextNode(); NextI != nullptr; NextI = NextI->getNextNode()) { auto *NextN = getNodeOrNull(NextI); if (NextN == nullptr) return nullptr; - if (auto *NextMemN = dyn_cast(NextN)) + auto *NextMemN = dyn_cast(NextN); + if (NextMemN != nullptr && NextMemN != SkipN) return NextMemN; } return nullptr; @@ -377,6 +379,20 @@ void DependencyGraph::notifyMoveInstr(Instruction *I, const BBIterator &To) { !(To == BB->end() && std::next(I->getIterator()) == BB->end()) && "Should not have been called if destination is same as origin."); + // TODO: We can only handle fully internal movements within DAGInterval or at + // the borders, i.e., right before the top or right after the bottom. 
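+  // As an illustration: with DAGInterval spanning I1..I3 in a block
+  // I1 I2 I3 I4, the destination may be inside I1..I3, right before the
+  // top (I1), or right after the bottom (I3, i.e. at I4's position); the
+  // asserts below reject anything farther away.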
+ assert(To.getNodeParent() == I->getParent() && + "TODO: We don't support movement across BBs!"); + assert( + (To == std::next(DAGInterval.bottom()->getIterator()) || + (To != BB->end() && std::next(To) == DAGInterval.top()->getIterator()) || + (To != BB->end() && DAGInterval.contains(&*To))) && + "TODO: To should be either within the DAGInterval or right " + "before/after it."); + + // Make a copy of the DAGInterval before we update it. + auto OrigDAGInterval = DAGInterval; + // Maintain the DAGInterval. DAGInterval.notifyMoveInstr(I, To); @@ -389,23 +405,37 @@ void DependencyGraph::notifyMoveInstr(Instruction *I, const BBIterator &To) { MemDGNode *MemN = dyn_cast(N); if (MemN == nullptr) return; - // First detach it from the existing chain. + + // First safely detach it from the existing chain. MemN->detachFromChain(); + // Now insert it back into the chain at the new location. - if (To != BB->end()) { - DGNode *ToN = getNodeOrNull(&*To); - if (ToN != nullptr) { - MemN->setPrevNode(getMemDGNodeBefore(ToN, /*IncludingN=*/false)); - MemN->setNextNode(getMemDGNodeAfter(ToN, /*IncludingN=*/true)); - } + // + // We won't always have a DGNode to insert before it. If `To` is BB->end() or + // if it points to an instr after DAGInterval.bottom() then we will have to + // find a node to insert *after*. + // + // BB: BB: + // I1 I1 ^ + // I2 I2 | DAGInteval [I1 to I3] + // I3 I3 V + // I4 I4 <- `To` == right after DAGInterval + // <- `To` == BB->end() + // + if (To == BB->end() || + To == std::next(OrigDAGInterval.bottom()->getIterator())) { + // If we don't have a node to insert before, find a node to insert after and + // update the chain. + DGNode *InsertAfterN = getNode(&*std::prev(To)); + MemN->setPrevNode( + getMemDGNodeBefore(InsertAfterN, /*IncludingN=*/true, /*SkipN=*/MemN)); } else { - // MemN becomes the last instruction in the BB. - auto *TermN = getNodeOrNull(BB->getTerminator()); - if (TermN != nullptr) { - MemN->setPrevNode(getMemDGNodeBefore(TermN, /*IncludingN=*/false)); - } else { - // The terminator is outside the DAG interval so do nothing. - } + // We have a node to insert before, so update the chain. + DGNode *BeforeToN = getNode(&*To); + MemN->setPrevNode( + getMemDGNodeBefore(BeforeToN, /*IncludingN=*/false, /*SkipN=*/MemN)); + MemN->setNextNode( + getMemDGNodeAfter(BeforeToN, /*IncludingN=*/true, /*SkipN=*/MemN)); } } diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp index 3fa4de501f3f5..29fc05a7f256a 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp @@ -926,3 +926,46 @@ define void @foo(ptr %ptr, ptr %ptr2, i8 %v1, i8 %v2, i8 %v3, i8 %arg) { EXPECT_EQ(LdN->getPrevNode(), S1N); EXPECT_EQ(LdN->getNextNode(), S2N); } + +// Check that the mem chain is maintained correctly when the move destination is +// not a mem node. 
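+// The store being moved lands next to zext instructions, which have no
+// MemDGNode, so the chain must be re-stitched via the nearest memory
+// neighbors rather than via a node at the destination itself.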
+TEST_F(DependencyGraphTest, MoveInstrCallbackWithNonMemInstrs) { + parseIR(C, R"IR( +define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %arg) { + %ld = load i8, ptr %ptr + %zext1 = zext i8 %arg to i32 + %zext2 = zext i8 %arg to i32 + store i8 %v1, ptr %ptr + store i8 %v2, ptr %ptr + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *Ld = cast(&*It++); + [[maybe_unused]] auto *Zext1 = cast(&*It++); + auto *Zext2 = cast(&*It++); + auto *S1 = cast(&*It++); + auto *S2 = cast(&*It++); + auto *Ret = cast(&*It++); + + sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx); + DAG.extend({Ld, S2}); + auto *LdN = cast(DAG.getNode(Ld)); + auto *S1N = cast(DAG.getNode(S1)); + auto *S2N = cast(DAG.getNode(S2)); + EXPECT_EQ(LdN->getNextNode(), S1N); + EXPECT_EQ(S1N->getNextNode(), S2N); + + S1->moveBefore(Zext2); + EXPECT_EQ(LdN->getNextNode(), S1N); + EXPECT_EQ(S1N->getNextNode(), S2N); + + // Try move right after the end of the DAGInterval. + S1->moveBefore(Ret); + EXPECT_EQ(S2N->getNextNode(), S1N); + EXPECT_EQ(S1N->getNextNode(), nullptr); +} From 425d25f5df4c6814e5551640b810bec53322f3df Mon Sep 17 00:00:00 2001 From: Hiroshi Yamauchi <56735936+hjyamauchi@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:01:41 -0800 Subject: [PATCH 052/432] [AArch64][WinCFI] Fix a crash due to missing seh directives (#123993) https://github.com/llvm/llvm-project/issues/123808 --- .../Target/AArch64/AArch64FrameLowering.cpp | 22 +++-- .../CodeGen/AArch64/stack-hazard-windows.ll | 4 + .../AArch64/wincfi-missing-seh-directives.ll | 86 +++++++++++++++++++ 3 files changed, 103 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/wincfi-missing-seh-directives.ll diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index eabe64361938b..a082a1ebe95bf 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1491,13 +1491,6 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( NewOpc = AArch64::LDRQpost; break; } - // Get rid of the SEH code associated with the old instruction. - if (NeedsWinCFI) { - auto SEH = std::next(MBBI); - if (AArch64InstrInfo::isSEHInstruction(*SEH)) - SEH->eraseFromParent(); - } - TypeSize Scale = TypeSize::getFixed(1), Width = TypeSize::getFixed(0); int64_t MinOffset, MaxOffset; bool Success = static_cast(TII)->getMemOpInfo( @@ -1512,16 +1505,27 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( CSStackSizeInc > MaxOffset * (int64_t)Scale.getFixedValue()) { // If we are destroying the frame, make sure we add the increment after the // last frame operation. - if (FrameFlag == MachineInstr::FrameDestroy) + if (FrameFlag == MachineInstr::FrameDestroy) { ++MBBI; + // Also skip the SEH instruction, if needed + if (NeedsWinCFI && AArch64InstrInfo::isSEHInstruction(*MBBI)) + ++MBBI; + } emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag, - false, false, nullptr, EmitCFI, + false, NeedsWinCFI, HasWinCFI, EmitCFI, StackOffset::getFixed(CFAOffset)); return std::prev(MBBI); } + // Get rid of the SEH code associated with the old instruction. 
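+  // Note: this block used to run before the early-return above, which
+  // could delete a directive even when we bail out to emitFrameOffset;
+  // erasing only on this path keeps the unwind info for the split
+  // adjustment intact (emitFrameOffset now emits its own directives).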
+ if (NeedsWinCFI) { + auto SEH = std::next(MBBI); + if (AArch64InstrInfo::isSEHInstruction(*SEH)) + SEH->eraseFromParent(); + } + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); MIB.addReg(AArch64::SP, RegState::Define); diff --git a/llvm/test/CodeGen/AArch64/stack-hazard-windows.ll b/llvm/test/CodeGen/AArch64/stack-hazard-windows.ll index 2a034fe5e5290..927d8b68c46be 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard-windows.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard-windows.ll @@ -76,7 +76,9 @@ define i32 @fpr_csr_stackobj(double %x) "aarch64_pstate_sm_compatible" "frame-po ; CHECK1024: .seh_proc fpr_csr_stackobj ; CHECK1024-NEXT: // %bb.0: // %entry ; CHECK1024-NEXT: sub sp, sp, #1072 +; CHECK1024-NEXT: .seh_stackalloc 1072 ; CHECK1024-NEXT: str x23, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: .seh_save_reg x23, 0 ; CHECK1024-NEXT: str x29, [sp, #8] // 8-byte Folded Spill ; CHECK1024-NEXT: .seh_save_reg x29, 8 ; CHECK1024-NEXT: str x30, [sp, #16] // 8-byte Folded Spill @@ -105,7 +107,9 @@ define i32 @fpr_csr_stackobj(double %x) "aarch64_pstate_sm_compatible" "frame-po ; CHECK1024-NEXT: ldr x29, [sp, #8] // 8-byte Folded Reload ; CHECK1024-NEXT: .seh_save_reg x29, 8 ; CHECK1024-NEXT: ldr x23, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: .seh_save_reg x23, 0 ; CHECK1024-NEXT: add sp, sp, #1072 +; CHECK1024-NEXT: .seh_stackalloc 1072 ; CHECK1024-NEXT: .seh_endepilogue ; CHECK1024-NEXT: ret ; CHECK1024-NEXT: .seh_endfunclet diff --git a/llvm/test/CodeGen/AArch64/wincfi-missing-seh-directives.ll b/llvm/test/CodeGen/AArch64/wincfi-missing-seh-directives.ll new file mode 100644 index 0000000000000..2002c37cb2528 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/wincfi-missing-seh-directives.ll @@ -0,0 +1,86 @@ +; RUN: llc -mtriple=aarch64-windows %s --filetype obj -o /dev/null +; RUN: llc -mtriple=aarch64-windows %s --filetype asm -o - | FileCheck %s + +; Check that it doesn't crash and that each instruction in the +; prologue has a corresponding seh directive. 
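+; (Illustrative note: the obj run drives the Windows unwind-info emitter,
+; which is where the 'Incorrect size for ...' diagnostic and the crash
+; surfaced when a prologue instruction lacked its directive.)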
+; +; CHECK-NOT: error: Incorrect size for +; CHECK: foo: +; CHECK: .seh_proc foo +; CHECK: sub sp, sp, #288 +; CHECK: .seh_stackalloc 288 +; CHECK: str x19, [sp] // 8-byte Folded Spill +; CHECK: .seh_save_reg x19, 0 +; CHECK: str x21, [sp, #8] // 8-byte Folded Spill +; CHECK: .seh_save_reg x21, 8 +; CHECK: stp x23, x24, [sp, #16] // 16-byte Folded Spill +; CHECK: .seh_save_regp x23, 16 +; CHECK: stp x25, x26, [sp, #32] // 16-byte Folded Spill +; CHECK: .seh_save_regp x25, 32 +; CHECK: stp x27, x28, [sp, #48] // 16-byte Folded Spill +; CHECK: .seh_save_regp x27, 48 +; CHECK: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK: .seh_save_fplr 64 +; CHECK: sub sp, sp, #224 +; CHECK: .seh_stackalloc 224 +; CHECK: .seh_endprologue + +target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64-unknown-windows-msvc19.42.34436" + +%swift.refcounted = type { ptr, i64 } +%TScA_pSg = type <{ [16 x i8] }> +%T5repro4TestVSg = type <{ [32 x i8] }> +%T5repro4TestV = type <{ %TSS, %TSS }> +%TSS = type <{ %Ts11_StringGutsV }> +%Ts11_StringGutsV = type <{ %Ts13_StringObjectV }> +%Ts13_StringObjectV = type <{ %Ts6UInt64V, ptr }> +%Ts6UInt64V = type <{ i64 }> + +declare swiftcc ptr @swift_task_alloc() + +declare swifttailcc void @bar(ptr, ptr, i64, i64, i64, ptr, i64, i64, i64, i64, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr) + +define swifttailcc void @foo(ptr %0, ptr swiftasync %1, ptr swiftself %2, ptr %3, ptr %._guts2._object._object, ptr %.rid4._guts._object._object, ptr %4, ptr %.idx8, ptr %.idx8._guts._object._object, ptr %5, ptr %.rid9._guts._object._object, ptr %6) { +entry: + %7 = load i64, ptr null, align 8 + %8 = load i64, ptr %3, align 8 + %9 = getelementptr <{ %swift.refcounted, %TScA_pSg, %TSS, %T5repro4TestVSg, %T5repro4TestV, %TSS, %TSS, %TSS, %T5repro4TestV, %TSS, %T5repro4TestV, %T5repro4TestV, %TSS }>, ptr %2, i32 0, i32 2 + %10 = load i64, ptr %9, align 8 + %11 = load ptr, ptr %1, align 8 + %12 = getelementptr <{ %swift.refcounted, %TScA_pSg, %TSS, %T5repro4TestVSg, %T5repro4TestV, %TSS, %TSS, %TSS, %T5repro4TestV, %TSS, %T5repro4TestV, %T5repro4TestV, %TSS }>, ptr %2, i32 0, i32 3 + %13 = load i64, ptr %.rid9._guts._object._object, align 8 + %14 = load i64, ptr %.idx8._guts._object._object, align 8 + %15 = load i64, ptr %5, align 8 + %16 = getelementptr { i64, i64, i64, i64 }, ptr %12, i32 0, i32 3 + %17 = load i64, ptr %16, align 8 + %18 = getelementptr <{ %swift.refcounted, %TScA_pSg, %TSS, %T5repro4TestVSg, %T5repro4TestV, %TSS, %TSS, %TSS, %T5repro4TestV, %TSS, %T5repro4TestV, %T5repro4TestV, %TSS }>, ptr %2, i32 0, i32 4 + %19 = load i64, ptr %18, align 8 + %.rid._guts._object._object = getelementptr %Ts13_StringObjectV, ptr %18, i32 0, i32 1 + %20 = load ptr, ptr %.rid._guts._object._object, align 8 + %21 = load i64, ptr %.rid4._guts._object._object, align 8 + %22 = load i64, ptr %0, align 8 + %23 = load ptr, ptr %6, align 8 + %24 = load i64, ptr %2, align 8 + %25 = load ptr, ptr %._guts2._object._object, align 8 + %26 = getelementptr <{ %swift.refcounted, %TScA_pSg, %TSS, %T5repro4TestVSg, %T5repro4TestV, %TSS, %TSS, %TSS, %T5repro4TestV, %TSS, %T5repro4TestV, %T5repro4TestV, %TSS }>, ptr %2, i32 0, i32 7 + %27 = load i64, ptr %26, align 8 + %._guts3._object._object = getelementptr %Ts13_StringObjectV, ptr %26, i32 0, i32 1 + %28 = load ptr, ptr %._guts3._object._object, align 8 + %29 = getelementptr <{ %swift.refcounted, %TScA_pSg, %TSS, 
%T5repro4TestVSg, %T5repro4TestV, %TSS, %TSS, %TSS, %T5repro4TestV, %TSS, %T5repro4TestV, %T5repro4TestV, %TSS }>, ptr %2, i32 0, i32 8 + %30 = load i64, ptr %29, align 8 + %.idx5 = getelementptr %T5repro4TestV, ptr %29, i32 0, i32 1 + %31 = load i64, ptr %.idx5, align 8 + %.idx5._guts._object._object = getelementptr %Ts13_StringObjectV, ptr %.idx5, i32 0, i32 1 + %32 = load ptr, ptr %.idx5._guts._object._object, align 8 + %33 = getelementptr <{ %swift.refcounted, %TScA_pSg, %TSS, %T5repro4TestVSg, %T5repro4TestV, %TSS, %TSS, %TSS, %T5repro4TestV, %TSS, %T5repro4TestV, %T5repro4TestV, %TSS }>, ptr %2, i32 0, i32 9 + %34 = load i64, ptr %33, align 8 + %35 = load i64, ptr %4, align 8 + %36 = load i64, ptr %.idx8, align 8 + %37 = load i64, ptr %1, align 8 + %38 = call swiftcc ptr @swift_task_alloc() + store ptr null, ptr %3, align 8 + store ptr null, ptr %4, align 8 + musttail call swifttailcc void @bar(ptr null, ptr swiftasync %.rid4._guts._object._object, i64 %7, i64 %8, i64 %10, ptr %5, i64 %13, i64 %14, i64 %15, i64 %17, i64 %19, ptr %20, i64 %21, ptr %.idx8, i64 %22, ptr %23, i64 %24, ptr %25, i64 %27, ptr %28, i64 %30, ptr %.idx8._guts._object._object, i64 %31, ptr %32, i64 %34, ptr %._guts2._object._object, i64 %35, ptr %2, i64 %36, ptr %1, i64 %37, ptr %0, i64 0, ptr null, i64 0, ptr null) + ret void +} From 77c23fd0aa1534abe904c2d5256a6d7879dc3cf7 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Fri, 24 Jan 2025 14:12:18 -0800 Subject: [PATCH 053/432] [AMDGPU] Update AMDGPUUsage.rst to document two intrinsics (#123816) The AMDGPUUsage.rst file is updated to document two intrinsics: llvm.amdgcn.mov.dpp and llvm.amdgcn.update.dpp. --- llvm/docs/AMDGPUUsage.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 40b393224f15d..8f09df2406f10 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1422,6 +1422,19 @@ The AMDGPU backend implements the following LLVM IR intrinsics. Returns a pair for the swapped registers. The first element of the return corresponds to the swapped element of the first argument. + llvm.amdgcn.mov.dpp The llvm.amdgcn.mov.dpp.`` intrinsic represents the mov.dpp operation in AMDGPU. + This operation is being deprecated and can be replaced with llvm.amdgcn.update.dpp. + + llvm.amdgcn.update.dpp The llvm.amdgcn.update.dpp.`` intrinsic represents the update.dpp operation in AMDGPU. + It takes an old value, a source operand, a DPP control operand, a row mask, a bank mask, and a bound control. + Various data types are supported, including, bf16, f16, f32, f64, i16, i32, i64, p0, p3, p5, v2f16, v2f32, v2i16, v2i32, v2p0, v3i32, v4i32, v8f16. + This operation is equivalent to a sequence of v_mov_b32 operations. + It is preferred over llvm.amdgcn.mov.dpp.`` for future use. + `llvm.amdgcn.update.dpp. ` + Should be equivalent to: + - `v_mov_b32 ` + - `v_mov_b32 ` + ============================================== ========================================================== .. TODO:: From 34c6c5e72f48de65a7e332033af9566576c1895d Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 24 Jan 2025 14:20:24 -0800 Subject: [PATCH 054/432] [BOLT][AArch64] Fix PLT optimization (#124192) Preserve C++ exception metadata while running PLT optimization on AArch64. 
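The essence of the fix, sketched from the AArch64 hunk below (not a complete implementation): the helper now takes the direct call by rvalue and transfers every MCPlus annotation to the new indirect call, where previously only the tail-call bit was recreated and the EH landing-pad annotation was lost:

  // before (lossy): only the tail-call annotation survived the rewrite
  if (IsTailCall)
    setTailCall(InstCall);
  // after: handler and tail-call annotations both move to the `blr x17`
  moveAnnotations(std::move(DirectCall), InstCall);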
--- bolt/include/bolt/Core/MCPlusBuilder.h | 7 ++++--- bolt/lib/Passes/PLTCall.cpp | 4 ++-- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 5 ++--- bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 2 +- bolt/test/AArch64/exceptions-plt.cpp | 21 +++++++++++++++++++ bolt/test/runtime/exceptions-plt.cpp | 16 ++++++++++++++ 6 files changed, 46 insertions(+), 9 deletions(-) create mode 100644 bolt/test/AArch64/exceptions-plt.cpp create mode 100644 bolt/test/runtime/exceptions-plt.cpp diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 5d77e6faff2fc..c1460b2aac8a6 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1426,11 +1426,12 @@ class MCPlusBuilder { } /// Creates an indirect call to the function within the \p DirectCall PLT - /// stub. The function's memory location is pointed by the \p TargetLocation + /// stub. The function's address location is pointed by the \p TargetLocation /// symbol. + /// Move instruction annotations from \p DirectCall to the indirect call. virtual InstructionListType - createIndirectPltCall(const MCInst &DirectCall, - const MCSymbol *TargetLocation, MCContext *Ctx) { + createIndirectPLTCall(MCInst &&DirectCall, const MCSymbol *TargetLocation, + MCContext *Ctx) { llvm_unreachable("not implemented"); return {}; } diff --git a/bolt/lib/Passes/PLTCall.cpp b/bolt/lib/Passes/PLTCall.cpp index 2ed996fadbb99..31c2d92ebc204 100644 --- a/bolt/lib/Passes/PLTCall.cpp +++ b/bolt/lib/Passes/PLTCall.cpp @@ -70,8 +70,8 @@ Error PLTCall::runOnFunctions(BinaryContext &BC) { const BinaryFunction *CalleeBF = BC.getFunctionForSymbol(CallSymbol); if (!CalleeBF || !CalleeBF->isPLTFunction()) continue; - const InstructionListType NewCode = BC.MIB->createIndirectPltCall( - *II, CalleeBF->getPLTSymbol(), BC.Ctx.get()); + const InstructionListType NewCode = BC.MIB->createIndirectPLTCall( + std::move(*II), CalleeBF->getPLTSymbol(), BC.Ctx.get()); II = BB.replaceInstruction(II, NewCode); assert(!NewCode.empty() && "PLT Call replacement must be non-empty"); std::advance(II, NewCode.size() - 1); diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 0b6f21527f0ac..ac709c5dd063a 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -1263,7 +1263,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return true; } - InstructionListType createIndirectPltCall(const MCInst &DirectCall, + InstructionListType createIndirectPLTCall(MCInst &&DirectCall, const MCSymbol *TargetLocation, MCContext *Ctx) override { const bool IsTailCall = isTailCall(DirectCall); @@ -1297,8 +1297,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { MCInst InstCall; InstCall.setOpcode(IsTailCall ? 
AArch64::BR : AArch64::BLR); InstCall.addOperand(MCOperand::createReg(AArch64::X17)); - if (IsTailCall) - setTailCall(InstCall); + moveAnnotations(std::move(DirectCall), InstCall); Code.emplace_back(InstCall); return Code; diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 63086c06d74fd..465533ee71f2b 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -1605,7 +1605,7 @@ class X86MCPlusBuilder : public MCPlusBuilder { return true; } - InstructionListType createIndirectPltCall(const MCInst &DirectCall, + InstructionListType createIndirectPLTCall(MCInst &&DirectCall, const MCSymbol *TargetLocation, MCContext *Ctx) override { assert((DirectCall.getOpcode() == X86::CALL64pcrel32 || diff --git a/bolt/test/AArch64/exceptions-plt.cpp b/bolt/test/AArch64/exceptions-plt.cpp new file mode 100644 index 0000000000000..576f0fc91a9d8 --- /dev/null +++ b/bolt/test/AArch64/exceptions-plt.cpp @@ -0,0 +1,21 @@ +// Verify that PLT optimization in BOLT preserves exception-handling info. + +// REQUIRES: system-linux + +// RUN: %clangxx %cxxflags -O1 -Wl,-q,-znow %s -o %t.exe +// RUN: llvm-bolt %t.exe -o %t.bolt.exe --plt=all --print-only=.*main.* \ +// RUN: --print-finalized 2>&1 | FileCheck %s + +// CHECK-LABEL: Binary Function +// CHECK: adrp {{.*}}__cxa_throw +// CHECK-NEXT: ldr {{.*}}__cxa_throw +// CHECK-NEXT: blr x17 {{.*}} handler: {{.*}} PLTCall: + +int main() { + try { + throw new int; + } catch (...) { + return 0; + } + return 1; +} diff --git a/bolt/test/runtime/exceptions-plt.cpp b/bolt/test/runtime/exceptions-plt.cpp new file mode 100644 index 0000000000000..8a75a3cb384b9 --- /dev/null +++ b/bolt/test/runtime/exceptions-plt.cpp @@ -0,0 +1,16 @@ +// Verify that PLT optimization in BOLT preserves exception-handling info. + +// REQUIRES: system-linux + +// RUN: %clangxx %cxxflags -O1 -Wl,-q,-znow %s -o %t.exe +// RUN: llvm-bolt %t.exe -o %t.bolt.exe --plt=all +// RUN: %t.bolt.exe + +int main() { + try { + throw new int; + } catch (...) { + return 0; + } + return 1; +} From 4b209c5d87c8b8eb4bbf2750ea9daa5927a13699 Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 24 Jan 2025 14:28:55 -0800 Subject: [PATCH 055/432] [SandboxIR][Region] Add cost modeling to the region (#124354) This patch implements cost modeling for Region. All instructions that are added or removed get their cost counted in the Scoreboard. This is used for checking if the region before or after a transformation is more profitable. 
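A sketch of the intended use (hypothetical caller, not code from this patch):

  const auto &SB = Rgn.getScoreboard();
  // BeforeCost accumulates original instructions the rewrite deleted;
  // AfterCost accumulates instructions the rewrite added to the region.
  bool Profitable = SB.getAfterCost() < SB.getBeforeCost();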
--- llvm/include/llvm/SandboxIR/Region.h | 52 +++++++++++-- llvm/include/llvm/SandboxIR/Value.h | 1 + llvm/lib/SandboxIR/Region.cpp | 35 ++++++++- .../Passes/RegionsFromMetadata.cpp | 2 +- llvm/unittests/SandboxIR/PassTest.cpp | 8 +- llvm/unittests/SandboxIR/RegionTest.cpp | 73 ++++++++++++++++--- 6 files changed, 151 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/Region.h b/llvm/include/llvm/SandboxIR/Region.h index 8133e01734ea7..c1195141cb54c 100644 --- a/llvm/include/llvm/SandboxIR/Region.h +++ b/llvm/include/llvm/SandboxIR/Region.h @@ -6,18 +6,55 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_REGION_H -#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_REGION_H +#ifndef LLVM_SANDBOXIR_REGION_H +#define LLVM_SANDBOXIR_REGION_H #include #include "llvm/ADT/SetVector.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/SandboxIR/Instruction.h" #include "llvm/Support/raw_ostream.h" namespace llvm::sandboxir { +class Region; + +class ScoreBoard { + const Region &Rgn; + TargetTransformInfo &TTI; + constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + /// The cost of all instructions added to the region. + InstructionCost AfterCost = 0; + /// The cost of all instructions that got removed and replaced by new ones. + InstructionCost BeforeCost = 0; + /// Helper for both add() and remove(). \Returns the TTI cost of \p I. + InstructionCost getCost(Instruction *I) const; + /// No need to allow copies. + ScoreBoard(const ScoreBoard &) = delete; + const ScoreBoard &operator=(const ScoreBoard &) = delete; + +public: + ScoreBoard(Region &Rgn, TargetTransformInfo &TTI) : Rgn(Rgn), TTI(TTI) {} + /// Mark \p I as a newly added instruction to the region. + void add(Instruction *I) { AfterCost += getCost(I); } + /// Mark \p I as a deleted instruction from the region. + void remove(Instruction *I); + /// \Returns the cost of the newly added instructions. + InstructionCost getAfterCost() const { return AfterCost; } + /// \Returns the cost of the Removed instructions. + InstructionCost getBeforeCost() const { return BeforeCost; } + +#ifndef NDEBUG + void dump(raw_ostream &OS) const { + OS << "BeforeCost: " << BeforeCost << "\n"; + OS << "AfterCost: " << AfterCost << "\n"; + } + LLVM_DUMP_METHOD void dump() const; +#endif // NDEBUG +}; + /// The main job of the Region is to point to new instructions generated by /// vectorization passes. It is the unit that RegionPasses operate on with their /// runOnRegion() function. @@ -62,6 +99,8 @@ class Region { static constexpr const char *RegionStr = "sandboxregion"; Context &Ctx; + /// Keeps track of cost of instructions added and removed. + ScoreBoard Scoreboard; /// ID (for later deregistration) of the "create instruction" callback. Context::CallbackID CreateInstCB; @@ -72,7 +111,7 @@ class Region { // TODO: Add a way to encode/decode region info to/from metadata. public: - Region(Context &Ctx); + Region(Context &Ctx, TargetTransformInfo &TTI); ~Region(); Context &getContext() const { return Ctx; } @@ -91,7 +130,10 @@ class Region { iterator end() { return Insts.end(); } iterator_range insts() { return make_range(begin(), end()); } - static SmallVector> createRegionsFromMD(Function &F); + static SmallVector> + createRegionsFromMD(Function &F, TargetTransformInfo &TTI); + /// \Returns the ScoreBoard data structure that keeps track of instr costs. 
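+  /// Callers can compare getBeforeCost() against getAfterCost() to judge
+  /// whether the region's rewrite paid off.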
+ const ScoreBoard &getScoreboard() const { return Scoreboard; } #ifndef NDEBUG /// This is an expensive check, meant for testing. @@ -109,4 +151,4 @@ class Region { } // namespace llvm::sandboxir -#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_REGION_H +#endif // LLVM_SANDBOXIR_REGION_H diff --git a/llvm/include/llvm/SandboxIR/Value.h b/llvm/include/llvm/SandboxIR/Value.h index 243195f4c1c4b..28e33ca0f2312 100644 --- a/llvm/include/llvm/SandboxIR/Value.h +++ b/llvm/include/llvm/SandboxIR/Value.h @@ -167,6 +167,7 @@ class Value { // Region needs to manipulate metadata in the underlying LLVM Value, we don't // expose metadata in sandboxir. friend class Region; + friend class ScoreBoard; // Needs access to `Val` for the instruction cost. /// All values point to the context. Context &Ctx; diff --git a/llvm/lib/SandboxIR/Region.cpp b/llvm/lib/SandboxIR/Region.cpp index 1455012440f90..8c84d0c46fa10 100644 --- a/llvm/lib/SandboxIR/Region.cpp +++ b/llvm/lib/SandboxIR/Region.cpp @@ -11,7 +11,29 @@ namespace llvm::sandboxir { -Region::Region(Context &Ctx) : Ctx(Ctx) { +InstructionCost ScoreBoard::getCost(Instruction *I) const { + auto *LLVMI = cast(I->Val); + SmallVector Operands(LLVMI->operands()); + return TTI.getInstructionCost(LLVMI, Operands, CostKind); +} + +void ScoreBoard::remove(Instruction *I) { + auto Cost = getCost(I); + if (Rgn.contains(I)) + // If `I` is one of the newly added ones, then we should adjust `AfterCost` + AfterCost -= Cost; + else + // If `I` is one of the original instructions (outside the region) then it + // is part of the original code, so adjust `BeforeCost`. + BeforeCost += Cost; +} + +#ifndef NDEBUG +void ScoreBoard::dump() const { dump(dbgs()); } +#endif + +Region::Region(Context &Ctx, TargetTransformInfo &TTI) + : Ctx(Ctx), Scoreboard(*this, TTI) { LLVMContext &LLVMCtx = Ctx.LLVMCtx; auto *RegionStrMD = MDString::get(LLVMCtx, RegionStr); RegionMDN = MDNode::getDistinct(LLVMCtx, {RegionStrMD}); @@ -31,9 +53,15 @@ void Region::add(Instruction *I) { Insts.insert(I); // TODO: Consider tagging instructions lazily. cast(I->Val)->setMetadata(MDKind, RegionMDN); + // Keep track of the instruction cost. + Scoreboard.add(I); } void Region::remove(Instruction *I) { + // Keep track of the instruction cost. This needs to be done *before* we remove // `I` from the region.
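+  // (ScoreBoard::remove() consults Rgn.contains(I) to decide between
+  // adjusting AfterCost and BeforeCost, so `I` must still be a member.)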
+ Scoreboard.remove(I); + Insts.remove(I); cast(I->Val)->setMetadata(MDKind, nullptr); } @@ -58,7 +86,8 @@ void Region::dump() const { } #endif // NDEBUG -SmallVector> Region::createRegionsFromMD(Function &F) { +SmallVector> +Region::createRegionsFromMD(Function &F, TargetTransformInfo &TTI) { SmallVector> Regions; DenseMap MDNToRegion; auto &Ctx = F.getContext(); @@ -68,7 +97,7 @@ SmallVector> Region::createRegionsFromMD(Function &F) { Region *R = nullptr; auto It = MDNToRegion.find(MDN); if (It == MDNToRegion.end()) { - Regions.push_back(std::make_unique(Ctx)); + Regions.push_back(std::make_unique(Ctx, TTI)); R = Regions.back().get(); MDNToRegion[MDN] = R; } else { diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp index 8e3f5b77429c5..121a195f45ee4 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp @@ -19,7 +19,7 @@ RegionsFromMetadata::RegionsFromMetadata(StringRef Pipeline) bool RegionsFromMetadata::runOnFunction(Function &F, const Analyses &A) { SmallVector> Regions = - sandboxir::Region::createRegionsFromMD(F); + sandboxir::Region::createRegionsFromMD(F, A.getTTI()); for (auto &R : Regions) { RPM.runOnRegion(*R, A); } diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp index 751aedefd8fe2..19fce94563e48 100644 --- a/llvm/unittests/SandboxIR/PassTest.cpp +++ b/llvm/unittests/SandboxIR/PassTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/Pass.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Module.h" #include "llvm/SandboxIR/Constant.h" @@ -23,10 +24,13 @@ struct PassTest : public testing::Test { llvm::LLVMContext LLVMCtx; std::unique_ptr LLVMM; std::unique_ptr Ctx; + std::unique_ptr TTI; Function *parseFunction(const char *IR, const char *FuncName) { llvm::SMDiagnostic Err; LLVMM = parseAssemblyString(IR, Err, LLVMCtx); + TTI = std::make_unique(LLVMM->getDataLayout()); + if (!LLVMM) Err.print("PassTest", llvm::errs()); Ctx = std::make_unique(LLVMCtx); @@ -119,7 +123,7 @@ define i8 @foo(i8 %v0, i8 %v1) { EXPECT_EQ(TPass.getName(), "test-pass"); // Check runOnRegion(); llvm::SmallVector> Regions = - Region::createRegionsFromMD(*F); + Region::createRegionsFromMD(*F, *TTI); ASSERT_EQ(Regions.size(), 1u); TPass.runOnRegion(*Regions[0], Analyses::emptyForTesting()); EXPECT_EQ(InstCount, 2u); @@ -242,7 +246,7 @@ define i8 @foo(i8 %v0, i8 %v1) { RPM.addPass(std::make_unique(InstCount2)); // Check runOnRegion(). 
llvm::SmallVector> Regions = - Region::createRegionsFromMD(*F); + Region::createRegionsFromMD(*F, *TTI); ASSERT_EQ(Regions.size(), 1u); RPM.runOnRegion(*Regions[0], Analyses::emptyForTesting()); EXPECT_EQ(InstCount1, 2u); diff --git a/llvm/unittests/SandboxIR/RegionTest.cpp b/llvm/unittests/SandboxIR/RegionTest.cpp index 47368f93a32c0..1ee72d127daa4 100644 --- a/llvm/unittests/SandboxIR/RegionTest.cpp +++ b/llvm/unittests/SandboxIR/RegionTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/Region.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/AsmParser/Parser.h" #include "llvm/SandboxIR/Context.h" #include "llvm/SandboxIR/Function.h" @@ -20,10 +21,12 @@ using namespace llvm; struct RegionTest : public testing::Test { LLVMContext C; std::unique_ptr M; + std::unique_ptr TTI; void parseIR(LLVMContext &C, const char *IR) { SMDiagnostic Err; M = parseAssemblyString(IR, Err, C); + TTI = std::make_unique(M->getDataLayout()); if (!M) Err.print("RegionTest", errs()); } @@ -45,7 +48,7 @@ define i8 @foo(i8 %v0, i8 %v1) { auto *T0 = cast(&*It++); auto *T1 = cast(&*It++); auto *Ret = cast(&*It++); - sandboxir::Region Rgn(Ctx); + sandboxir::Region Rgn(Ctx, *TTI); // Check getContext. EXPECT_EQ(&Ctx, &Rgn.getContext()); @@ -73,7 +76,7 @@ define i8 @foo(i8 %v0, i8 %v1) { #ifndef NDEBUG // Check equality comparison. Insert in reverse order into `Other` to check // that comparison is order-independent. - sandboxir::Region Other(Ctx); + sandboxir::Region Other(Ctx, *TTI); Other.add(Ret); EXPECT_NE(Rgn, Other); Other.add(T1); @@ -98,7 +101,7 @@ define i8 @foo(i8 %v0, i8 %v1, ptr %ptr) { auto *T0 = cast(&*It++); auto *T1 = cast(&*It++); auto *Ret = cast(&*It++); - sandboxir::Region Rgn(Ctx); + sandboxir::Region Rgn(Ctx, *TTI); Rgn.add(T0); Rgn.add(T1); @@ -134,7 +137,7 @@ define i8 @foo(i8 %v0, i8 %v1) { auto *T2 = cast(&*It++); SmallVector> Regions = - sandboxir::Region::createRegionsFromMD(*F); + sandboxir::Region::createRegionsFromMD(*F, *TTI); EXPECT_THAT(Regions[0]->insts(), testing::UnorderedElementsAre(T0)); EXPECT_THAT(Regions[1]->insts(), testing::UnorderedElementsAre(T1, T2)); } @@ -160,7 +163,7 @@ define i8 @foo(i8 %v0, i8 %v1) { auto *T2 = cast(&*It++); SmallVector> Regions = - sandboxir::Region::createRegionsFromMD(*F); + sandboxir::Region::createRegionsFromMD(*F, *TTI); EXPECT_THAT(Regions[0]->insts(), testing::UnorderedElementsAre(T0, T2)); } @@ -182,9 +185,9 @@ define i8 @foo(i8 %v0, i8 %v1) { [[maybe_unused]] auto *T1 = cast(&*It++); auto *T2 = cast(&*It++); [[maybe_unused]] auto *Ret = cast(&*It++); - sandboxir::Region Rgn(Ctx); + sandboxir::Region Rgn(Ctx, *TTI); Rgn.add(T0); - sandboxir::Region Rgn2(Ctx); + sandboxir::Region Rgn2(Ctx, *TTI); Rgn2.add(T2); std::string output; @@ -226,14 +229,66 @@ define i8 @foo(i8 %v0, i8 %v1) { auto *T0 = cast(&*It++); auto *T1 = cast(&*It++); - sandboxir::Region Rgn(Ctx); + sandboxir::Region Rgn(Ctx, *TTI); Rgn.add(T0); Rgn.add(T1); SmallVector> Regions = - sandboxir::Region::createRegionsFromMD(*F); + sandboxir::Region::createRegionsFromMD(*F, *TTI); ASSERT_EQ(1U, Regions.size()); #ifndef NDEBUG EXPECT_EQ(Rgn, *Regions[0].get()); #endif } + +TEST_F(RegionTest, RegionCost) { + parseIR(C, R"IR( +define void @foo(i8 %v0, i8 %v1, i8 %v2) { + %add0 = add i8 %v0, 1 + %add1 = add i8 %v1, 2 + %add2 = add i8 %v2, 3 + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + auto *LLVMBB = &*LLVMF->begin(); + auto LLVMIt = LLVMBB->begin(); + auto 
*LLVMAdd0 = &*LLVMIt++; + auto *LLVMAdd1 = &*LLVMIt++; + auto *LLVMAdd2 = &*LLVMIt++; + + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *Add0 = cast(&*It++); + auto *Add1 = cast(&*It++); + auto *Add2 = cast(&*It++); + + sandboxir::Region Rgn(Ctx, *TTI); + const auto &SB = Rgn.getScoreboard(); + EXPECT_EQ(SB.getAfterCost(), 0); + EXPECT_EQ(SB.getBeforeCost(), 0); + + auto GetCost = [this](llvm::Instruction *LLVMI) { + constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + SmallVector Operands(LLVMI->operands()); + return TTI->getInstructionCost(LLVMI, Operands, CostKind); + }; + // Add `Add0` to the region, should be counted in "After". + Rgn.add(Add0); + EXPECT_EQ(SB.getBeforeCost(), 0); + EXPECT_EQ(SB.getAfterCost(), GetCost(LLVMAdd0)); + // Same for `Add1`. + Rgn.add(Add1); + EXPECT_EQ(SB.getBeforeCost(), 0); + EXPECT_EQ(SB.getAfterCost(), GetCost(LLVMAdd0) + GetCost(LLVMAdd1)); + // Remove `Add0`, should be subtracted from "After". + Rgn.remove(Add0); + EXPECT_EQ(SB.getBeforeCost(), 0); + EXPECT_EQ(SB.getAfterCost(), GetCost(LLVMAdd1)); + // Remove `Add2` which was never in the region, should be counted in "Before". + Rgn.remove(Add2); + EXPECT_EQ(SB.getBeforeCost(), GetCost(LLVMAdd2)); + EXPECT_EQ(SB.getAfterCost(), GetCost(LLVMAdd1)); +} From 05fd4d5775e2c40c00057d7af195290bc3a39cd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 24 Jan 2025 14:32:07 -0800 Subject: [PATCH 056/432] [flang][cuda] Perform inlined assignment when field is c_devptr (#124322) When a field in a derived type is `c_devptr`, keep checking whether we can do a memcpy instead of falling back to the runtime assignment. Many internal CUDA Fortran derived types have a `c_devptr` field and this would lead to stack overflow on the device if the assignment is performed by the runtime function. --- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 3 ++- flang/test/Lower/CUDA/cuda-devptr.cuf | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 64c540cfb95ae..35dc9a2abd69c 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -1410,7 +1410,8 @@ static bool recordTypeCanBeMemCopied(fir::RecordType recordType) { for (auto [_, fieldType] : recordType.getTypeList()) { // Derived type component may have user assignment (so far, we cannot tell // in FIR, so assume it is always the case, TODO: get the actual info). - if (mlir::isa(fir::unwrapSequenceType(fieldType))) + if (mlir::isa(fir::unwrapSequenceType(fieldType)) && + !fir::isa_builtin_c_devptr_type(fir::unwrapSequenceType(fieldType))) return false; // Allocatable components need deep copy.
if (auto boxType = mlir::dyn_cast(fieldType)) diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf index d61d84d9bc750..0a9087cf6c133 100644 --- a/flang/test/Lower/CUDA/cuda-devptr.cuf +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ -4,6 +4,12 @@ module cudafct use __fortran_builtins, only : c_devptr => __builtin_c_devptr + + type :: t1 + type(c_devptr) :: devp + integer :: a + end type + contains function c_devloc(x) use iso_c_binding, only: c_loc @@ -12,6 +18,10 @@ contains real, target, device :: x c_devloc%cptr = c_loc(x) end function + + attributes(device) function get_t1() + type(t1) :: get_t1 + end end subroutine sub1() @@ -68,3 +78,12 @@ end subroutine ! CHECK: %[[P_ADDR_COORD:.*]] = fir.coordinate_of %[[P_CPTR_COORD]], %[[ADDRESS_FIELD]] : (!fir.ref>, !fir.field) -> !fir.ref ! CHECK: %[[ADDR:.*]] = fir.load %[[RES_ADDR_COORD]] : !fir.ref ! CHECK: fir.store %[[ADDR]] to %[[P_ADDR_COORD]] : !fir.ref + +attributes(global) subroutine assign_nested_c_devptr(p, a) + use cudafct + type(t1), device :: p + p = get_t1() +end subroutine + +! CHECK-LABEL: func.func @_QPassign_nested_c_devptr +! CHECK-NOT: fir.call @_FortranAAssign From cff7ad56babc2e8e7c731b3f60d3c0b4c8aca96f Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 24 Jan 2025 14:35:20 -0800 Subject: [PATCH 057/432] [SandboxVec][Utils] Implement Utils::verifyFunction() (#124356) This patch implements a wrapper function for the LLVM IR verifier for functions, and calls it (flag-guarded) within the bottom-up-vectorizer for finding IR bugs as soon as they happen. --- llvm/include/llvm/SandboxIR/Utils.h | 9 +++++++++ .../SandboxVectorizer/Passes/BottomUpVec.cpp | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/Utils.h b/llvm/include/llvm/SandboxIR/Utils.h index d58fe52214395..5c6f0d9edd618 100644 --- a/llvm/include/llvm/SandboxIR/Utils.h +++ b/llvm/include/llvm/SandboxIR/Utils.h @@ -17,6 +17,8 @@ #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Verifier.h" +#include "llvm/SandboxIR/Function.h" #include "llvm/SandboxIR/Instruction.h" #include @@ -122,6 +124,13 @@ class Utils { const std::optional &OptLoc) { return BatchAA.getModRefInfo(cast(I->Val), OptLoc); } + + /// Equivalent to llvm::verifyFunction(). + /// \Returns true if the IR is broken. 
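+  /// Note: this verifies the whole underlying llvm::Function, so it is
+  /// expensive and meant for debugging (e.g. the -sbvec-always-verify
+  /// flag below).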
+ static bool verifyFunction(const Function *F, raw_ostream &OS) { + const auto &LLVMF = *cast(F->Val); + return llvm::verifyFunction(LLVMF, &OS); + } }; } // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 7cebde335cb4e..b3a477c64a5cc 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -27,6 +27,13 @@ static cl::opt AllowNonPow2("sbvec-allow-non-pow2", cl::init(false), cl::Hidden, cl::desc("Allow non-power-of-2 vectorization.")); +#ifndef NDEBUG +static cl::opt + AlwaysVerify("sbvec-always-verify", cl::init(false), cl::Hidden, + cl::desc("Helps find bugs by verifying the IR whenever we " + "emit new instructions (*very* expensive).")); +#endif // NDEBUG + namespace sandboxir { BottomUpVec::BottomUpVec(StringRef Pipeline) @@ -365,6 +372,17 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, break; } } +#ifndef NDEBUG + if (AlwaysVerify) { + // This helps find broken IR by constantly verifying the function. Note that + // this is very expensive and should only be used for debugging. + Instruction *I0 = isa(Bndl[0]) + ? cast(Bndl[0]) + : cast(UserBndl[0]); + assert(!Utils::verifyFunction(I0->getParent()->getParent(), dbgs()) && + "Broken function!"); + } +#endif return NewVec; } From d910fbcbd10c5e72d0771dd9607e7133ae51dc70 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Fri, 24 Jan 2025 14:46:01 -0800 Subject: [PATCH 058/432] [RISCV][NFC] cR Constraint Release Note --- llvm/docs/ReleaseNotes.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 05d902641d093..0872f20bea590 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -272,6 +272,9 @@ Changes to the RISC-V Backend * `cf` constraint meaning an RVC-encoding compatible FPR (`f8`-`f15`) * `R` constraint meaning an even-odd GPR pair (prints as the even register, but both registers in the pair are considered live). + * `cR` constraint meaning an RVC-encoding compatible even-odd GPR Pair (prints + as an even register between `x8` and `x14`, but both registers in the pair + are considered live). * `N` modifer meaning print the register encoding (0-31) rather than the name. * `f` and `cf` inline assembly constraints, when using F-/D-/H-in-X extensions, will use the relevant GPR rather than FPR. This makes inline assembly portable From ac75d322801411f496fe5d1155c86453f915ae98 Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 24 Jan 2025 14:52:57 -0800 Subject: [PATCH 059/432] [SandboxVec][VecUtils] Filter out instructions not in BB in VecUtils:getLowest() (#124360) This patch changes the functionality of `VecUtils::getLowest(Vals, BB)` such that it filters out any instructions in `Vals` that are not in BB. This is useful when Vals contains instructions from different BBs, because in that case we are only interested in one BB. 
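Sketch of the new behavior (hypothetical values, mirroring the updated unit tests): given I0 in BB0 and I1, I2 in BB1 in that program order,

  // Previously nullptr because the values span two blocks; now I0 is
  // skipped and the lowest of the BB1 instructions is returned.
  Instruction *Lowest = VecUtils::getLowest({I0, I1, I2}, BB1); // == I2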
--- .../Vectorize/SandboxVectorizer/VecUtils.h | 17 ++++++------ .../SandboxVectorizer/Passes/BottomUpVec.cpp | 2 +- .../Transforms/SandboxVectorizer/cross_bbs.ll | 4 +-- .../test/Transforms/SandboxVectorizer/pack.ll | 4 +-- .../SandboxVectorizer/VecUtilsTest.cpp | 27 ++++++++++++------- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h index 64090febc5a09..bec1cecf241f6 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h @@ -111,10 +111,12 @@ class VecUtils { return LowestI; } /// \Returns the lowest instruction in \p Vals, or nullptr if no instructions - /// are found or if not in the same BB. - static Instruction *getLowest(ArrayRef Vals) { - // Find the first Instruction in Vals. - auto It = find_if(Vals, [](Value *V) { return isa(V); }); + /// are found. Skips instructions not in \p BB. + static Instruction *getLowest(ArrayRef Vals, BasicBlock *BB) { + // Find the first Instruction in Vals that is also in `BB`. + auto It = find_if(Vals, [BB](Value *V) { + return isa(V) && cast(V)->getParent() == BB; + }); // If we couldn't find an instruction return nullptr. if (It == Vals.end()) return nullptr; @@ -122,15 +124,14 @@ class VecUtils { // Now look for the lowest instruction in Vals starting from one position // after FirstI. Instruction *LowestI = FirstI; - auto *LowestBB = LowestI->getParent(); for (auto *V : make_range(std::next(It), Vals.end())) { auto *I = dyn_cast(V); // Skip non-instructions. if (I == nullptr) continue; - // If the instructions are in different BBs return nullptr. - if (I->getParent() != LowestBB) - return nullptr; + // Skips instructions not in \p BB. + if (I->getParent() != BB) + continue; // If `LowestI` comes before `I` then `I` is the new lowest. if (LowestI->comesBefore(I)) LowestI = I; diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index b3a477c64a5cc..6f65657d29790 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -54,7 +54,7 @@ static SmallVector getOperand(ArrayRef Bndl, /// of BB if no instruction found in \p Vals. static BasicBlock::iterator getInsertPointAfterInstrs(ArrayRef Vals, BasicBlock *BB) { - auto *BotI = VecUtils::getLastPHIOrSelf(VecUtils::getLowest(Vals)); + auto *BotI = VecUtils::getLastPHIOrSelf(VecUtils::getLowest(Vals, BB)); if (BotI == nullptr) // We are using BB->begin() (or after PHIs) as the fallback insert point. 
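// (That happens when Vals holds only constants, arguments, or
// instructions from other blocks, all of which getLowest() skips.)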
return BB->empty() diff --git a/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll b/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll index e913fc5913ba7..6ec31060d7e0f 100644 --- a/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll +++ b/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll @@ -8,10 +8,10 @@ define void @cross_bbs(ptr %ptr) { ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr [[PTR]], i32 1 ; CHECK-NEXT: [[L0:%.*]] = load i8, ptr [[PTR0]], align 1 ; CHECK-NEXT: [[L1:%.*]] = load i8, ptr [[PTR1]], align 1 -; CHECK-NEXT: [[PACK:%.*]] = insertelement <2 x i8> poison, i8 [[L0]], i32 0 -; CHECK-NEXT: [[PACK1:%.*]] = insertelement <2 x i8> [[PACK]], i8 [[L1]], i32 1 ; CHECK-NEXT: br label %[[BB:.*]] ; CHECK: [[BB]]: +; CHECK-NEXT: [[PACK:%.*]] = insertelement <2 x i8> poison, i8 [[L0]], i32 0 +; CHECK-NEXT: [[PACK1:%.*]] = insertelement <2 x i8> [[PACK]], i8 [[L1]], i32 1 ; CHECK-NEXT: store <2 x i8> [[PACK1]], ptr [[PTR0]], align 1 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SandboxVectorizer/pack.ll b/llvm/test/Transforms/SandboxVectorizer/pack.ll index 373ab743fb890..a0aa2a79a0ade 100644 --- a/llvm/test/Transforms/SandboxVectorizer/pack.ll +++ b/llvm/test/Transforms/SandboxVectorizer/pack.ll @@ -59,12 +59,12 @@ define void @packFromOtherBB(ptr %ptr, i8 %val) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[ADD0:%.*]] = add i8 [[VAL]], 0 ; CHECK-NEXT: [[MUL1:%.*]] = mul i8 [[VAL]], 1 -; CHECK-NEXT: [[PACK:%.*]] = insertelement <2 x i8> poison, i8 [[ADD0]], i32 0 -; CHECK-NEXT: [[PACK1:%.*]] = insertelement <2 x i8> [[PACK]], i8 [[MUL1]], i32 1 ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[PHI0:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ 1, %[[LOOP]] ] ; CHECK-NEXT: [[PHI1:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ 1, %[[LOOP]] ] +; CHECK-NEXT: [[PACK:%.*]] = insertelement <2 x i8> poison, i8 [[ADD0]], i32 0 +; CHECK-NEXT: [[PACK1:%.*]] = insertelement <2 x i8> [[PACK]], i8 [[MUL1]], i32 1 ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 ; CHECK-NEXT: store <2 x i8> [[PACK1]], ptr [[GEP0]], align 1 ; CHECK-NEXT: br label %[[LOOP]] diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp index a46e47afea3c7..5c062df8112f6 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp @@ -461,24 +461,33 @@ define void @foo(i8 %v) { // Check getLowest(ArrayRef) SmallVector C1Only({C1}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(C1Only), nullptr); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1Only, &BB), nullptr); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1Only, &BB0), nullptr); SmallVector AOnly({IA}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(AOnly), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AOnly, &BB), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AOnly, &BB0), nullptr); SmallVector AC1({IA, C1}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1, &BB), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1, &BB0), nullptr); SmallVector C1A({C1, IA}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(C1A), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1A, &BB), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1A, &BB0), nullptr); SmallVector AC1B({IA, C1, IB}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1B), IB); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1B, &BB), IB); + 
EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1B, &BB0), nullptr); SmallVector ABC1({IA, IB, C1}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(ABC1), IB); + EXPECT_EQ(sandboxir::VecUtils::getLowest(ABC1, &BB), IB); + EXPECT_EQ(sandboxir::VecUtils::getLowest(ABC1, &BB0), nullptr); SmallVector AC1C2({IA, C1, C2}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1C2), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1C2, &BB), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1C2, &BB0), nullptr); SmallVector C1C2C3({C1, C2, C3}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(C1C2C3), nullptr); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1C2C3, &BB), nullptr); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1C2C3, &BB0), nullptr); SmallVector DiffBBs({BB0I, IA}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(DiffBBs), nullptr); + EXPECT_EQ(sandboxir::VecUtils::getLowest(DiffBBs, &BB0), BB0I); + EXPECT_EQ(sandboxir::VecUtils::getLowest(DiffBBs, &BB), IA); } TEST_F(VecUtilsTest, GetLastPHIOrSelf) { From 4df9c17e5f436702ca4f5439322972b0385d629a Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Fri, 24 Jan 2025 23:59:00 +0100 Subject: [PATCH 060/432] [libc++] Fix tests for clang::no_specializations for C++17 and C++20 --- libcxx/include/__type_traits/result_of.h | 2 +- .../ranges/no_specializations.verify.cpp | 4 +++- .../type_traits/no_specializations.verify.cpp | 24 ++++++++++++------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/libcxx/include/__type_traits/result_of.h b/libcxx/include/__type_traits/result_of.h index 217ca70b4cd20..8cc009dbe8baa 100644 --- a/libcxx/include/__type_traits/result_of.h +++ b/libcxx/include/__type_traits/result_of.h @@ -22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS) template -struct _LIBCPP_DEPRECATED_IN_CXX17 result_of; +struct _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_NO_SPECIALIZATIONS result_of; template struct _LIBCPP_TEMPLATE_VIS result_of<_Fp(_Args...)> : __invoke_result<_Fp, _Args...> {}; diff --git a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp index 69d458a920558..489e3a6a73744 100644 --- a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp @@ -13,7 +13,9 @@ #include -#if !__has_warning("-Winvalid-specialization") +#include "test_macros.h" + +#if !__has_warning("-Winvalid-specialization") || TEST_STD_VER <= 20 // expected-no-diagnostics #else struct S {}; diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp index e6d960667e8c0..807d01e381b49 100644 --- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -36,15 +36,22 @@ SPECIALIZE_TRAIT(make_unsigned); // expected-error {{cannot be specialize SPECIALIZE_TRAIT(remove_all_extents); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_const); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_cv); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_extent); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_pointer); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_reference); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_volatile); // expected-error {{cannot 
be specialized}}
-SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}}
 SPECIALIZE_TRAIT(underlying_type); // expected-error {{cannot be specialized}}
-SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}}
-SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}}
+
+# if TEST_STD_VER <= 17
+SPECIALIZE_TRAIT(result_of); // expected-error {{cannot be specialized}}
+# endif
+
+# if TEST_STD_VER >= 20
+SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}}
+SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}}
+SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}}
+SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}}
+# endif

 # undef SPECIALIZE_TRAIT
 # define SPECIALIZE_UTT(Trait) \
@@ -96,7 +103,6 @@ SPECIALIZE_UTT(is_move_assignable); // expected-error 2 {{cannot
 SPECIALIZE_UTT(is_move_constructible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_BTT(is_nothrow_assignable); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_constructible); // expected-error 2 {{cannot be specialized}}
-SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_copy_assignable); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_copy_constructible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_default_constructible); // expected-error 2 {{cannot be specialized}}
@@ -130,7 +136,6 @@ SPECIALIZE_UTT(is_trivially_default_constructible); // expected-error 2 {{cannot
 SPECIALIZE_UTT(is_trivially_destructible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_trivially_move_assignable); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_trivially_move_constructible); // expected-error 2 {{cannot be specialized}}
-SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_union); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_unsigned); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_void); // expected-error 2 {{cannot be specialized}}
@@ -140,11 +145,12 @@ SPECIALIZE_UTT(rank); // expected-error 2 {{cannot

 # if TEST_STD_VER <= 17
 SPECIALIZE_UTT(is_literal_type); // expected-error 2 {{cannot be specialized}}
-SPECIALIZE_UTT(result_of); // expected-error 2 {{cannot be specialized}}
 # endif

 # if TEST_STD_VER >= 20
-SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}}
+SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}}
+SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}}
+SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}}
 # endif

 # if TEST_STD_VER >= 23
@@ -171,6 +177,8 @@ struct std::conditional; // expected-error {{cannot be specialized}}
 template <>
 struct std::enable_if; // expected-error {{cannot be specialized}}

+#if TEST_STD_VER >= 20
 template <>
 struct std::integral_constant; // expected-error {{cannot be specialized}}
 #endif
+#endif

From e2005d1461942539f7533a518aa78017074f6bf9 Mon Sep 17 00:00:00 2001
From: Jacob Lalonde
Date: Fri, 24 Jan 2025 14:59:56 -0800
Subject: [PATCH 061/432] [LLDB] Reapply #123873 SBSaveCore Docstrings
 (#124355)

In my last attempt at this (#123873), I didn't realize we needed
semicolons! Also fixed the bug where the feature summary didn't have a
type defined.
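For reference, a rough sketch of how these documented APIs are meant to
be driven from the Python bindings (illustrative only: it assumes an
existing lldb.SBProcess named `process` and the SBSaveCoreOptions
overload of SBProcess.SaveCore; the plugin name and output path are
placeholders):

    import lldb

    options = lldb.SBSaveCoreOptions()
    options.SetPluginName("minidump")           # a plugin registered with the plugin manager
    options.SetStyle(lldb.eSaveCoreCustomOnly)  # save only what the options specify
    options.SetOutputFile(lldb.SBFileSpec("/tmp/example.dmp"))
    options.SetProcess(process)
    error = options.AddThread(process.GetSelectedThread())  # returns an SBError
    if error.Success():
        error = process.SaveCore(options)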
CC @JDevlieghere hope you get a laugh at needing to revert docstrings for breaking the build....
---
 .../interface/SBSaveCoreOptionsDocstrings.i   | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i b/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i
index e69de29bb2d1d..08bbdf89d68de 100644
--- a/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i
+++ b/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i
@@ -0,0 +1,71 @@
+%feature("docstring",
+"A container to specify how to save a core file.
+
+SBSaveCoreOptions includes APIs to specify the memory regions and threads to include
+when generating a core file. It extends the existing SaveCoreStyle option.
+
+* eSaveCoreFull will save off all thread and memory regions, ignoring the memory regions and threads in
+the options object.
+
+* eSaveCoreDirtyOnly will capture all threads and all rw- memory regions, in addition to the regions specified
+in the options object if they are not already captured.
+
+* eSaveCoreStackOnly will capture all threads, but no memory regions unless specified.
+
+* eSaveCoreCustomOnly defers entirely to the SBSaveCoreOptions object and will only save what is specified.
+  Picking custom and specifying nothing will result in an error being returned.
+
+Note that currently ELF Core files are not supported."
+) lldb::SBSaveCoreOptions;
+
+%feature("docstring", "
+    Set the plugin name to save a Core file with. Only plugins registered with the plugin manager will be accepted.
+    Examples are Minidump and Mach-O."
+) lldb::SBSaveCoreOptions::SetPluginName;
+
+%feature("docstring", "
+    Get the specified plugin name, or None if the name is not set."
+) lldb::SBSaveCoreOptions::GetPluginName;
+
+%feature("docstring", "
+    Set the lldb.SaveCoreStyle."
+) lldb::SBSaveCoreOptions::SetStyle;
+
+%feature("docstring", "
+    Get the specified lldb.SaveCoreStyle, or eSaveCoreUnspecified if not set."
+) lldb::SBSaveCoreOptions::GetStyle;
+
+%feature("docstring", "
+    Set the file path to save the Core file at."
+) lldb::SBSaveCoreOptions::SetOutputFile;
+
+%feature("docstring", "
+    Get an SBFileSpec corresponding to the specified output path, or None if not set."
+) lldb::SBSaveCoreOptions::GetOutputFile;
+
+%feature("docstring", "
+    Set the process to save, or unset a process by providing a default SBProcess.
+    Resetting will result in the reset of all process-specific options, such as the threads to save."
+) lldb::SBSaveCoreOptions::SetProcess;
+
+%feature("docstring", "
+    Add an SBThread to be saved; an error will be returned if an SBThread from a different process is specified.
+    The process is set either by the first SBThread added to the options container, or explicitly by the SetProcess call."
+) lldb::SBSaveCoreOptions::AddThread;
+
+%feature("docstring", "
+    Remove an SBThread if present in the container; returns true if a matching thread was found and removed."
+) lldb::SBSaveCoreOptions::RemoveThread;
+
+%feature("docstring", "
+    Add a memory region to save; an error will be returned if the region is invalid.
+    Ranges that overlap will be unioned into a single region."
+) lldb::SBSaveCoreOptions::AddMemoryRegionToSave;
+
+%feature("docstring", "
+    Get an SBThreadCollection of all threads marked to be saved. This collection is not sorted according to insertion order."
+) lldb::SBSaveCoreOptions::GetThreadsToSave;
+
+%feature("docstring", "
+    Unset all options."
+) lldb::SBSaveCoreOptions::Clear;

From 241e5d8c5c424155e02e05524e8f731fc524aa40 Mon Sep 17 00:00:00 2001
From: Brox Chen
Date: Fri, 24 Jan 2025 18:15:40 -0500
Subject: [PATCH 062/432] [AMDGPU][True16][MC] true16 for v_cmpx_eq_f16
 (#124038)

True16 format for v_cmpx_eq_f16. Also cleaned up some stray gfx11 check
lines in the gfx12 dasm tests.
---
 llvm/lib/Target/AMDGPU/VOPCInstructions.td    |  2 +-
 .../AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s  | 65 ++++----
 .../AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s   | 25 +++-
 .../MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s     | 14 +-
 llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s         | 75 ++++++----
 llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s   | 65 ++++----
 llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s    | 21 ++-
 llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s | 42 ++++--
 .../MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s   | 42 ++++--
 llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s        | 14 +-
 llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s  | 73 +++++----
 llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s   | 33 +++--
 llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s         | 72 +++++----
 llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s   | 62 ++++----
 llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s    | 18 ++-
 llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s | 42 ++++--
 .../MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s   | 42 ++++--
 .../gfx11_dasm_vop3_dpp16_from_vopcx.txt      | 54 +++++--
 .../gfx11_dasm_vop3_dpp8_from_vopcx.txt       | 24 ++-
 .../AMDGPU/gfx11_dasm_vop3_from_vopcx.txt     | 14 +-
 .../Disassembler/AMDGPU/gfx11_dasm_vopcx.txt  | 65 ++++++--
 .../AMDGPU/gfx11_dasm_vopcx_dpp16.txt         | 54 +++++--
 .../AMDGPU/gfx11_dasm_vopcx_dpp8.txt          | 24 ++-
 .../Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt | 28 ++--
 .../AMDGPU/gfx12_dasm_vop3cx_dpp16.txt        | 140 +++++++++++-------
 .../AMDGPU/gfx12_dasm_vop3cx_dpp8.txt         | 64 +++-----
 .../Disassembler/AMDGPU/gfx12_dasm_vopcx.txt  | 61 ++++++--
 .../AMDGPU/gfx12_dasm_vopcx_dpp16.txt         | 50 +++++--
 .../AMDGPU/gfx12_dasm_vopcx_dpp8.txt          | 14 +-
 29 files changed, 845 insertions(+), 454 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index aa930249c5003..46cad585b8a82 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1977,7 +1977,7 @@ defm V_CMP_CLASS_F64 : VOPC_Real_gfx11_gfx12<0x07f>;
 defm V_CMPX_F_F16_fake16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">;
 defm V_CMPX_LT_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">;
-defm V_CMPX_EQ_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">;
+defm V_CMPX_EQ_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">;
 defm V_CMPX_LE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x083, "v_cmpx_le_f16">;
 defm V_CMPX_GT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">;
 defm V_CMPX_LG_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">;
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s
index 80264a4a791bb..e946097f35e23 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s
@@ -96,47 +96,56 @@ v_cmpx_class_f32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
 v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x05,0x30]

-v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11:
v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s index 119e4826b3277..e60406078f745 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s @@ -29,17 +29,26 @@ v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x01,0xfe,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] -v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s index 1614f00e1f07e..799a8f86a01e9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s @@ -149,11 +149,11 @@ v_cmpx_class_f64_e64 -|src_scc|, src_scc v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 // GFX11: v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 ; encoding: 
[0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_eq_f16_e64 v1, v2 -// GFX11: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_eq_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_eq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_eq_f16_e64 v255, v255 -// GFX11: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_eq_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_eq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_f16_e64 s1, s2 // GFX11: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00] @@ -194,6 +194,12 @@ v_cmpx_eq_f16_e64 -src_scc, |vcc_lo| v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_eq_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_eq_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_eq_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_eq_f32_e64 v1, v2 // GFX11: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s index cdad89321d89a..88d9fb6cc1357 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s @@ -143,50 +143,65 @@ v_cmpx_class_f64 src_scc, v2 v_cmpx_class_f64 0xaf123456, v255 // GFX11: v_cmpx_class_f64_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_eq_f16 v1, v2 -// GFX11: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d] +v_cmpx_eq_f16 v1.l, v2.l +// GFX11: v_cmpx_eq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x04,0x7d] -v_cmpx_eq_f16 v127, v2 -// GFX11: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d] +v_cmpx_eq_f16 v127.l, v2.l +// GFX11: v_cmpx_eq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x04,0x7d] -v_cmpx_eq_f16 s1, v2 -// GFX11: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d] +v_cmpx_eq_f16 s1, v2.l +// GFX11: v_cmpx_eq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x04,0x7d] -v_cmpx_eq_f16 s105, v2 -// GFX11: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d] +v_cmpx_eq_f16 s105, v2.l +// GFX11: v_cmpx_eq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x04,0x7d] -v_cmpx_eq_f16 vcc_lo, v2 -// GFX11: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d] +v_cmpx_eq_f16 vcc_lo, v2.l +// GFX11: v_cmpx_eq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x04,0x7d] -v_cmpx_eq_f16 vcc_hi, v2 -// GFX11: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d] +v_cmpx_eq_f16 vcc_hi, v2.l +// GFX11: v_cmpx_eq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x04,0x7d] -v_cmpx_eq_f16 ttmp15, v2 -// GFX11: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d] +v_cmpx_eq_f16 ttmp15, v2.l +// GFX11: v_cmpx_eq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x04,0x7d] -v_cmpx_eq_f16 m0, v2 -// GFX11: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d] +v_cmpx_eq_f16 m0, v2.l +// GFX11: v_cmpx_eq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x04,0x7d] -v_cmpx_eq_f16 exec_lo, v2 -// GFX11: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d] +v_cmpx_eq_f16 exec_lo, v2.l +// GFX11: v_cmpx_eq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x04,0x7d] -v_cmpx_eq_f16 exec_hi, v2 -// GFX11: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: 
[0x7f,0x04,0x04,0x7d] +v_cmpx_eq_f16 exec_hi, v2.l +// GFX11: v_cmpx_eq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x04,0x7d] -v_cmpx_eq_f16 null, v2 -// GFX11: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d] +v_cmpx_eq_f16 null, v2.l +// GFX11: v_cmpx_eq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x04,0x7d] -v_cmpx_eq_f16 -1, v2 -// GFX11: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d] +v_cmpx_eq_f16 -1, v2.l +// GFX11: v_cmpx_eq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x04,0x7d] -v_cmpx_eq_f16 0.5, v2 -// GFX11: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d] +v_cmpx_eq_f16 0.5, v2.l +// GFX11: v_cmpx_eq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x04,0x7d] -v_cmpx_eq_f16 src_scc, v2 -// GFX11: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d] +v_cmpx_eq_f16 src_scc, v2.l +// GFX11: v_cmpx_eq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x04,0x7d] -v_cmpx_eq_f16 0xfe0b, v127 -// GFX11: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_f16 0xfe0b, v127.l +// GFX11: v_cmpx_eq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_eq_f16 v1.h, v2.l +// GFX11: v_cmpx_eq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x04,0x7d] + +v_cmpx_eq_f16 v127.h, v2.l +// GFX11: v_cmpx_eq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x04,0x7d] + +v_cmpx_eq_f16 0.5, v127.l +// GFX11: v_cmpx_eq_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x04,0x7d] + +v_cmpx_eq_f16 src_scc, v2.h +// GFX11: v_cmpx_eq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x05,0x7d] + +v_cmpx_eq_f16 0xfe0b, v127.h +// GFX11: v_cmpx_eq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_eq_f32 v1, v2 // GFX11: v_cmpx_eq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x24,0x7d] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s index ddaa30af953b8..e8d458874596e 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s @@ -95,47 +95,56 @@ v_cmpx_class_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x35,0x30] -v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_eq_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_eq_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_eq_f16 v1, v2 row_mirror -// GFX11: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_half_mirror -// GFX11: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_shl:1 -// GFX11: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_shl:15 -// GFX11: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_shr:1 -// GFX11: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_shr:15 -// GFX11: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_ror:1 -// GFX11: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_ror:15 -// GFX11: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_eq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_eq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_eq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_eq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xf5,0x30] + +v_cmpx_eq_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x5f,0x01,0x01] + +v_cmpx_eq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_eq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x05,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_eq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_eq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s index 1cead89c0a82e..4f8895faf10a2 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s @@ -29,14 +29,23 @@ v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_eq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_eq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_eq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x77,0x39,0x05] + +v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x05,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00] v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s index 5cab502e99647..fe2220d7f5902 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s @@ -37,23 +37,41 @@ 
v_cmpx_class_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_class_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction -v_cmpx_eq_f16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_eq_f16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_f16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_eq_f16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_f16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_f16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_eq_i16_e32 v1.h, v255.h // GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s index 5102a32075066..c6814de818e6d 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s @@ -37,23 +37,41 @@ v_cmpx_class_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_class_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_class_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_eq_f16 v1, v255 -// GFX11: v_cmpx_eq_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_eq_f16 v1.h, v255.h +// GFX11: v_cmpx_eq_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x82,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_eq_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16 v1, v255 
quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_f16 v255, v2 -// GFX11: v_cmpx_eq_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_eq_f16 v1.l, v255.l +// GFX11: v_cmpx_eq_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_eq_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_eq_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_eq_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_eq_f16 v255.h, v2.h +// GFX11: v_cmpx_eq_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x82,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_eq_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_eq_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_eq_f16 v255.l, v2.l +// GFX11: v_cmpx_eq_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_eq_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_eq_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_eq_i16 v1.h, v255.h // GFX11: v_cmpx_eq_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb2,0xd4,0x01,0xff,0x03,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s index d7bec00b83080..17bd81fa7d259 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s @@ -146,11 +146,11 @@ v_cmpx_class_f64_e64 -|src_scc|, src_scc v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 // GFX12: v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 ; encoding: [0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_eq_f16_e64 v1, v2 -// GFX12: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_eq_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_eq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_eq_f16_e64 v255, v255 -// GFX12: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_eq_f16_e64 v255.l, v255.l +// GFX12: 
v_cmpx_eq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_f16_e64 s1, s2 // GFX12: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00] @@ -191,6 +191,12 @@ v_cmpx_eq_f16_e64 -src_scc, |vcc_lo| v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_eq_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_eq_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_eq_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_eq_f32_e64 v1, v2 // GFX12: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s index faad68f902d5f..86f4b9a6789dd 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s @@ -107,53 +107,62 @@ v_cmpx_class_f32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x05,0x30] -v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 
-v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: 
v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s index 588ad2b75a410..071a00ac73b8a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s @@ -41,23 +41,32 @@ v_cmpx_class_f32_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x01,0xfe,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] -v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cmpx_eq_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cmpx_eq_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cmpx_eq_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cmpx_eq_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+
+v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s
index 4d43b98978eb5..ab01d37c39d3e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s
@@ -140,50 +140,62 @@ v_cmpx_class_f64 src_scc, v2

 v_cmpx_class_f64 0xaf123456, v255
 // GFX12: v_cmpx_class_f64_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf]

-v_cmpx_eq_f16 v1, v2
-// GFX12: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d]
+v_cmpx_eq_f16 v1.l, v2.l
+// GFX12: v_cmpx_eq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x04,0x7d]

-v_cmpx_eq_f16 v127, v2
-// GFX12: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d]
+v_cmpx_eq_f16 v127.l, v2.l
+// GFX12: v_cmpx_eq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x04,0x7d]

-v_cmpx_eq_f16 s1, v2
-// GFX12: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d]
+v_cmpx_eq_f16 s1, v2.l
+// GFX12: v_cmpx_eq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x04,0x7d]

-v_cmpx_eq_f16 s105, v2
-// GFX12: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d]
+v_cmpx_eq_f16 s105, v2.l
+// GFX12: v_cmpx_eq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x04,0x7d]

-v_cmpx_eq_f16 vcc_lo, v2
-// GFX12: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d]
+v_cmpx_eq_f16 vcc_lo, v2.l
+// GFX12: v_cmpx_eq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x04,0x7d]

-v_cmpx_eq_f16 vcc_hi, v2
-// GFX12: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d]
+v_cmpx_eq_f16 vcc_hi, v2.l
+// GFX12: v_cmpx_eq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x04,0x7d]

-v_cmpx_eq_f16 ttmp15, v2
-// GFX12: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d]
+v_cmpx_eq_f16 ttmp15, v2.l
+// GFX12: v_cmpx_eq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x04,0x7d]

-v_cmpx_eq_f16 m0, v2
-// GFX12: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d]
+v_cmpx_eq_f16 m0, v2.l
+// GFX12: v_cmpx_eq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x04,0x7d]

-v_cmpx_eq_f16 exec_lo, v2
-// GFX12: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d]
+v_cmpx_eq_f16 exec_lo, v2.l
+// GFX12: v_cmpx_eq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x04,0x7d]

-v_cmpx_eq_f16 exec_hi, v2
-// GFX12: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d]
+v_cmpx_eq_f16 exec_hi, v2.l
+// GFX12: v_cmpx_eq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x04,0x7d]

-v_cmpx_eq_f16 null, v2
-// GFX12: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d]
+v_cmpx_eq_f16 null, v2.l
+// GFX12: v_cmpx_eq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x04,0x7d]

-v_cmpx_eq_f16 -1, v2
-// GFX12: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d]
+v_cmpx_eq_f16 -1, v2.l
+// GFX12: v_cmpx_eq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x04,0x7d]

-v_cmpx_eq_f16 0.5, v2
-// GFX12: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d]
+v_cmpx_eq_f16 0.5, v2.l
+// GFX12: v_cmpx_eq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x04,0x7d]

-v_cmpx_eq_f16 src_scc, v2
-// GFX12: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d]
+v_cmpx_eq_f16 src_scc, v2.l
+// GFX12: v_cmpx_eq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x04,0x7d]

-v_cmpx_eq_f16 0xfe0b, v127
-// GFX12: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_eq_f16 0xfe0b, v127.l
+// GFX12: v_cmpx_eq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00]
+
+v_cmpx_eq_f16 v1.h, v2.l
+// GFX12: v_cmpx_eq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x04,0x7d]
+
+v_cmpx_eq_f16 v127.h, v2.l
+// GFX12: v_cmpx_eq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x04,0x7d]
+
+v_cmpx_eq_f16 src_scc, v2.h
+// GFX12: v_cmpx_eq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x05,0x7d]
+
+v_cmpx_eq_f16 0xfe0b, v127.h
+// GFX12: v_cmpx_eq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00]

 v_cmpx_eq_f32 v1, v2
 // GFX12: v_cmpx_eq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x24,0x7d]

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s
index 5c54d1ad5788c..2b919fa9d671e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s
@@ -92,47 +92,53 @@ v_cmpx_class_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0

 v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x35,0x30]

-v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX12: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX12: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_eq_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_eq_f16 v1, v2 row_mirror
-// GFX12: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_mirror
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_half_mirror
-// GFX12: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_half_mirror
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_shl:1
-// GFX12: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_shl:1
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_shl:15
-// GFX12: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_shl:15
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_shr:1
-// GFX12: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_shr:1
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_shr:15
-// GFX12: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_shr:15
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_ror:1
-// GFX12: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_ror:1
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_ror:15
-// GFX12: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_ror:15
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_eq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_eq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_eq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX12: v_cmpx_eq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_eq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cmpx_eq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x05,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_eq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cmpx_eq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX12: v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff]

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s
index c6e7fd1aa96da..11579786d78a8 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s
@@ -26,14 +26,20 @@ v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1

 v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_eq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX12: v_cmpx_eq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x05,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00]

 v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05]

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s
index 7ba3aff6c80ca..265ab2c8ff66d 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s
@@ -37,23 +37,41 @@ v_cmpx_class_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]

 v_cmpx_class_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction

-v_cmpx_eq_f16_e32 v1, v255
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_eq_f16_e32 v1.h, v255.h
+// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_eq_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_eq_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_eq_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_eq_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_eq_f16_e32 v255, v2
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_eq_f16_e32 v1.l, v255.l
+// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_eq_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_eq_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_eq_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_eq_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_eq_f16_e32 v255.h, v2.h
+// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_eq_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_eq_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_eq_f16_e32 v255.l, v2.l
+// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_eq_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_eq_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction

 v_cmpx_eq_i16_e32 v1.h, v255.h
 // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s
index b7423dcde03d4..ed228c061d019 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s
@@ -37,23 +37,41 @@ v_cmpx_class_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 v_cmpx_class_f16 v255.l, v2.l quad_perm:[3,2,1,0]
 // GFX12: v_cmpx_class_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_eq_f16 v1, v255
-// GFX12: v_cmpx_eq_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_eq_f16 v1.h, v255.h
+// GFX12: v_cmpx_eq_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x82,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_eq_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cmpx_eq_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX12: v_cmpx_eq_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_eq_f16 v255, v2
-// GFX12: v_cmpx_eq_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_eq_f16 v1.l, v255.l
+// GFX12: v_cmpx_eq_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_eq_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cmpx_eq_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_eq_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX12: v_cmpx_eq_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_eq_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_f16 v255.h, v2.h
+// GFX12: v_cmpx_eq_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x82,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_eq_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_eq_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_eq_f16 v255.l, v2.l
+// GFX12: v_cmpx_eq_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_eq_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_eq_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

 v_cmpx_eq_i16 v1.h, v255.h
 // GFX12: v_cmpx_eq_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb2,0xd4,0x01,0xff,0x03,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt
index 20250c1df729e..b73c7f83c7442 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt
@@ -115,46 +115,72 @@
 # GFX11: v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

 0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

 0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

 0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt
index a1ef8f36e77be..0b7e14108848c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt
@@ -31,16 +31,32 @@
 # GFX11: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]

 0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

 0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

 0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

 0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
-# GFX11: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+
+0x7e,0x19,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+0x7e,0x0a,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+0x7e,0x93,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt
index a3e9f92454e3a..cd897944845a0 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt
@@ -160,10 +160,12 @@
 # GFX11: v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 ; encoding: [0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

 0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00
-# GFX11: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX11-REAL16: v_cmpx_eq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00
-# GFX11: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX11-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]

 0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00
 # GFX11: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00]

@@ -204,6 +206,14 @@
 0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
 # GFX11: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00
+# GFX11-REAL16: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00
+# GFX11-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+
 0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00
 # GFX11: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt
index f058a9b981625..90a95138144f1 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt
@@ -166,49 +166,84 @@
 # GFX11: v_cmpx_class_f64_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf]

 0x01,0x05,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d]

 0x7f,0x05,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d]

 0x01,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d]

 0x69,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d]

 0x6a,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d]

 0x6b,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d]

 0x7b,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d]

 0x7d,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d]

 0x7e,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d]

 0x7f,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d]

 0x7c,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d]

 0xc1,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d]

 0xf0,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d]

 0xfd,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d]

 0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00
-# GFX11: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00]
+
+0x81,0x05,0x04,0x7d
+# GFX11-REAL16: v_cmpx_eq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x04,0x7d]
+
+0xff,0x05,0x04,0x7d
+# GFX11-REAL16: v_cmpx_eq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x04,0x7d]
+
+0xf0,0xfe,0x04,0x7d
+# GFX11-REAL16: v_cmpx_eq_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x04,0x7d]
+
+0xfd,0x04,0x05,0x7d
+# GFX11-REAL16: v_cmpx_eq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x05,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x05,0x7d]
+
+0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_cmpx_eq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00]

 0x01,0x05,0x24,0x7d
 # GFX11: v_cmpx_eq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x24,0x7d]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt
index 5a57f93c65939..9b9b423a7b104 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt
@@ -115,46 +115,72 @@
 # GFX11: v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x3d,0x30]

 0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01]

 0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13]

 0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30
-# GFX11: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30]
+# GFX11-REAL16: v_cmpx_eq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30]
+# GFX11-FAKE16: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30]
+
+0xfa,0xfe,0x04,0x7d,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_eq_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_eq_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x5f,0x01,0x01]
+
+0xfa,0x04,0x05,0x7d,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_eq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x05,0x7d,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_eq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x05,0x7d,0x81,0x60,0x01,0x13]
+
+0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11-REAL16: v_cmpx_eq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xfd,0x30]
+# GFX11-FAKE16: v_cmpx_eq_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xfd,0x30]

 0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff
 # GFX11: v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt
index 8350088ca95a5..6ca58524688e2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt
@@ -31,10 +31,30 @@
 # GFX11: v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00]

 0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]

 0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_eq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x04,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_eq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_eq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_eq_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_eq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00]

 0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05
 # GFX11: v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
index 80235451fec6f..d3a19ae00fa21 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
@@ -156,10 +156,12 @@
 # GFX12: v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 ; encoding: [0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

 0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_eq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]

 0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00
 # GFX12: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00]

@@ -200,6 +202,15 @@
 0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
 # GFX12: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+
+
 0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00]

@@ -336,7 +347,6 @@
 # GFX12-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_eq_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00]

@@ -474,7 +484,6 @@
 # GFX12-REAL16: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_eq_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00]

@@ -738,7 +747,6 @@
 # GFX12-REAL16: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_ge_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00]

@@ -876,7 +884,6 @@
 # GFX12-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_ge_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00]

@@ -1140,7 +1147,6 @@
 # GFX12-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_gt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00]

@@ -1278,7 +1284,6 @@
 # GFX12-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_gt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00]

@@ -1542,7 +1547,6 @@
 # GFX12-REAL16: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_le_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00]

@@ -1680,7 +1684,6 @@
 # GFX12-REAL16: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_le_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00]

@@ -2080,7 +2083,6 @@
 # GFX12-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_lt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00]

@@ -2218,7 +2220,6 @@
 # GFX12-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_lt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00]

@@ -2356,7 +2357,6 @@
 # GFX12-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_ne_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00]

@@ -2494,8 +2494,6 @@
 # GFX12-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00]
-
 0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_ne_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
index 2dc231a4220f1..a1061a067d73c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
@@ -171,49 +171,123 @@
 # GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

 0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

 0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

 0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX12: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+
+0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x82,0xd4,0x69,0xd2,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x82,0xd4,0x6a,0xf6,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x82,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_eq_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x82,0xd4,0x7b,0xfa,0x01,0x00]
+
+0x7e,0x00,0x82,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_eq_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x82,0xd4,0x7d,0xe0,0x01,0x00]
+
+0x7e,0x00,0x82,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_eq_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x82,0xd4,0x7e,0x82,0x01,0x00]
+
+0x7e,0x01,0x82,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x82,0xd4,0x7f,0xf8,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x82,0xd4,0x7c,0xfc,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x82,0xd4,0xc1,0xfe,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_eq_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x82,0xd4,0xf0,0xfa,0x00,0x40]
+
+0x7e,0x02,0x82,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_eq_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x82,0xd4,0xfd,0xd4,0x00,0x20]
+
+0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
+
+0x7e,0x19,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

@@ -367,17 +441,14 @@
0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -537,17 +608,14 @@ 0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp 
v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -794,17 +862,14 @@ 0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -961,17 +1026,14 @@ 0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; 
encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -1218,17 +1280,14 @@ 0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -1385,17 +1444,14 @@ 0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -1642,17 +1698,14 @@ 0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -1809,17 +1862,14 @@ 0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -2230,17 +2280,14 @@ 0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -2397,17 +2444,14 @@ 0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: 
[0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -2564,17 +2608,14 @@ 0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -2731,17 +2772,14 @@ 0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt index cff9497778265..56e1ea1194a5c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt @@ -38,19 +38,37 @@ # GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] 0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, s3 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -82,17 +100,14 @@ # GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: 
v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -118,17 +133,14 @@ # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -187,17 +199,14 @@ # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -223,17 +232,14 @@ # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -289,17 +295,14 @@ # 
GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -325,17 +328,14 @@ # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0x7e,0x10,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -391,17 +391,14 @@ # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -427,17 +424,14 @@ # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -540,17 +534,14 @@ # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -576,17 +567,14 @@ # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x08,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -612,17 +600,14 @@ # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -651,17 +636,14 @@ # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt index 6ca815a1c88d3..b458e613d2f20 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt @@ -162,49 +162,80 @@ # GFX12: v_cmpx_class_f64_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d] 0x7f,0x05,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d] 0x01,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d] 0x69,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d] 0x6a,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d] 0x6b,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d] 0x7b,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d] 0x7d,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d] 0x7e,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 exec_lo, v2.l ; 
encoding: [0x7e,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d] 0x7f,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d] 0x7c,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d] 0xc1,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d] 0xf0,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d] 0xfd,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d] 0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x04,0x7d +# GFX12-REAL16: v_cmpx_eq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x04,0x7d] + +0xff,0x05,0x04,0x7d +# GFX12-REAL16: v_cmpx_eq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x04,0x7d] + +0xfd,0x04,0x05,0x7d +# GFX12-REAL16: v_cmpx_eq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x05,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x05,0x7d] + +0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_eq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x24,0x7d # GFX12: v_cmpx_eq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x24,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt index f1fca29120490..0289ed58f07e3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt @@ -111,46 +111,68 @@ # GFX12: v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x3d,0x30] 0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_eq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x05,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_eq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x05,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_eq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x05,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_eq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_eq_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff] diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt index b2539ad5a49e7..e55d6a4c77c2a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt @@ -27,10 +27,20 @@ # GFX12: v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05] From d789915f35a976bb532441915249cd1b165c2fd5 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 24 Jan 2025 15:32:35 -0800 Subject: [PATCH 063/432] [Github] Bump Runner Version in Containers (#124324) This patch bumps the runner version to v2.322.0 in the CI containers. Nothing looks suspicious in the change log, and it is important to keep the runner up to date or we will end up with containers that cannot connect to Github due to having a version too old. 
--- .../workflows/containers/github-action-ci-windows/Dockerfile | 2 +- .github/workflows/containers/github-action-ci/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/containers/github-action-ci-windows/Dockerfile b/.github/workflows/containers/github-action-ci-windows/Dockerfile index 2295e39d62c30..9a1fab694c9df 100644 --- a/.github/workflows/containers/github-action-ci-windows/Dockerfile +++ b/.github/workflows/containers/github-action-ci-windows/Dockerfile @@ -108,7 +108,7 @@ RUN choco install -y handle RUN pip3 install pywin32 buildbot-worker==2.8.4 -ARG RUNNER_VERSION=2.321.0 +ARG RUNNER_VERSION=2.322.0 ENV RUNNER_VERSION=$RUNNER_VERSION RUN powershell -Command \ diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile index 35a0f1f6020dc..377b8f14402ee 100644 --- a/.github/workflows/containers/github-action-ci/Dockerfile +++ b/.github/workflows/containers/github-action-ci/Dockerfile @@ -96,7 +96,7 @@ WORKDIR /home/gha FROM ci-container as ci-container-agent -ENV GITHUB_RUNNER_VERSION=2.321.0 +ENV GITHUB_RUNNER_VERSION=2.322.0 RUN mkdir actions-runner && \ cd actions-runner && \ From 280c7d719834a828895b8a39f8ea982527fdcc73 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 24 Jan 2025 15:37:36 -0800 Subject: [PATCH 064/432] [CI] Increase Configurability of Monolithic Windows Build (#124328) This patch makes it so that the caller of monolithic-windows.sh can set the maximum number of parallel compile/link jobs in environment variables rather than hard-coding them in the CMake invocation. Additionally, the env variable definitions for CC, CXX, and LD are sunk into the shell script, since those config options are inherent to what the pipeline is testing. This is intended to make things more flexible and usable for the new premerge CI pipeline, particularly as we are looking at using larger runners and want the increased flexibility to experiment.
--- .ci/generate-buildkite-pipeline-premerge | 5 ++--- .ci/monolithic-windows.sh | 8 ++++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge index 9d9ca32183944..e547afaeb722f 100755 --- a/.ci/generate-buildkite-pipeline-premerge +++ b/.ci/generate-buildkite-pipeline-premerge @@ -128,9 +128,8 @@ if [[ "${windows_projects}" != "" ]]; then limit: 2 timeout_in_minutes: 150 env: - CC: 'cl' - CXX: 'cl' - LD: 'link' + MAX_PARALLEL_COMPILE_JOBS: '16' + MAX_PARALLEL_LINK_JOBS: '4' commands: - 'C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64' - 'bash .ci/monolithic-windows.sh "$(echo ${windows_projects} | tr ' ' ';')" "$(echo ${windows_check_targets})"' diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 68303a3ea153a..57b276f3e1df0 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -50,6 +50,10 @@ echo "--- cmake" pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt pip install -q -r "${MONOREPO_ROOT}"/.ci/requirements.txt +export CC=cl +export CXX=cl +export LD=link + # The CMAKE_*_LINKER_FLAGS to disable the manifest come from research # on fixing a build reliability issue on the build server, please # see https://github.com/llvm/llvm-project/pull/82393 and @@ -72,8 +76,8 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \ -D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \ -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" \ - -D LLVM_PARALLEL_COMPILE_JOBS=16 \ - -D LLVM_PARALLEL_LINK_JOBS=4 + -D LLVM_PARALLEL_COMPILE_JOBS=${MAX_PARALLEL_COMPILE_JOBS} \ + -D LLVM_PARALLEL_LINK_JOBS=${MAX_PARALLEL_LINK_JOBS} echo "--- ninja" # Targets are not escaped as they are passed as separate arguments. From 6409799bdcd86be3ed72e8d172181294d3e5ad09 Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 24 Jan 2025 15:39:37 -0800 Subject: [PATCH 065/432] [SandboxVec][Legality] Pack from different BBs (#124363) When the inputs of the pack come from different BBs we need to make sure we emit the pack instructions at the correct place. 
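For illustration, here is a minimal standalone sketch of the same-BB guard this adds (toy stand-ins rather than the real sandboxir types; `allInSameBB` is a hypothetical helper, not part of the patch):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Toy stand-ins for sandboxir::BasicBlock and sandboxir::Instruction.
struct BasicBlock {};
struct Instruction {
  BasicBlock *Parent = nullptr;
  BasicBlock *getParent() const { return Parent; }
};

// True when every instruction in the bundle lives in one basic block; when
// this fails, vectorizing the bundle is rejected and a pack is emitted
// instead.
bool allInSameBB(const std::vector<Instruction *> &Bndl) {
  assert(!Bndl.empty() && "expected a non-empty bundle");
  const BasicBlock *BB = Bndl.front()->getParent();
  return std::all_of(Bndl.begin() + 1, Bndl.end(),
                     [BB](const Instruction *I) { return I->getParent() == BB; });
}
```

The change itself (below) does the equivalent with `any_of` over `drop_begin(Bndl)` and reports `ResultReason::DiffBBs`, so such bundles are packed rather than vectorized.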
--- .../Vectorize/SandboxVectorizer/Legality.h | 3 ++ .../Vectorize/SandboxVectorizer/Legality.cpp | 5 ++++ .../test/Transforms/SandboxVectorizer/pack.ll | 27 +++++++++++++++++ .../SandboxVectorizer/LegalityTest.cpp | 29 +++++++++++++++++-- 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h index f10c535aa820e..156b788d8a203 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h @@ -91,6 +91,7 @@ enum class ResultReason { DiffTypes, DiffMathFlags, DiffWrapFlags, + DiffBBs, NotConsecutive, CantSchedule, Unimplemented, @@ -127,6 +128,8 @@ struct ToStr { return "DiffMathFlags"; case ResultReason::DiffWrapFlags: return "DiffWrapFlags"; + case ResultReason::DiffBBs: + return "DiffBBs"; case ResultReason::NotConsecutive: return "NotConsecutive"; case ResultReason::CantSchedule: diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp index 085f4cd67ab76..48bc246e4b56a 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp @@ -214,6 +214,11 @@ const LegalityResult &LegalityAnalysis::canVectorize(ArrayRef Bndl, dumpBndl(Bndl);); return createLegalityResult(ResultReason::NotInstructions); } + // Pack if not in the same BB. + auto *BB = cast(Bndl[0])->getParent(); + if (any_of(drop_begin(Bndl), + [BB](auto *V) { return cast(V)->getParent() != BB; })) + return createLegalityResult(ResultReason::DiffBBs); auto CollectDescrs = getHowToCollectValues(Bndl); if (CollectDescrs.hasVectorInputs()) { diff --git a/llvm/test/Transforms/SandboxVectorizer/pack.ll b/llvm/test/Transforms/SandboxVectorizer/pack.ll index a0aa2a79a0ade..ec6e61a90c0fb 100644 --- a/llvm/test/Transforms/SandboxVectorizer/pack.ll +++ b/llvm/test/Transforms/SandboxVectorizer/pack.ll @@ -88,3 +88,30 @@ loop: exit: ret void } + +define void @packFromDiffBBs(ptr %ptr, i8 %v) { +; CHECK-LABEL: define void @packFromDiffBBs( +; CHECK-SAME: ptr [[PTR:%.*]], i8 [[V:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ADD0:%.*]] = add i8 [[V]], 1 +; CHECK-NEXT: br label %[[BB:.*]] +; CHECK: [[BB]]: +; CHECK-NEXT: [[ADD1:%.*]] = add i8 [[V]], 2 +; CHECK-NEXT: [[PACK:%.*]] = insertelement <2 x i8> poison, i8 [[ADD0]], i32 0 +; CHECK-NEXT: [[PACK1:%.*]] = insertelement <2 x i8> [[PACK]], i8 [[ADD1]], i32 1 +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: store <2 x i8> [[PACK1]], ptr [[GEP0]], align 1 +; CHECK-NEXT: ret void +; +entry: + %add0 = add i8 %v, 1 + br label %bb + +bb: + %add1 = add i8 %v, 2 + %gep0 = getelementptr i8, ptr %ptr, i64 0 + %gep1 = getelementptr i8, ptr %ptr, i64 1 + store i8 %add0, ptr %gep0 + store i8 %add1, ptr %gep1 + ret void +} diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp index b421d08bc6b02..acc887f9dc6c1 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp @@ -57,11 +57,24 @@ struct LegalityTest : public testing::Test { } }; +static sandboxir::BasicBlock *getBasicBlockByName(sandboxir::Function *F, + StringRef Name) { + for (sandboxir::BasicBlock &BB : *F) + if 
(BB.getName() == Name) return &BB; llvm_unreachable("Expected to find basic block!"); } TEST_F(LegalityTest, LegalitySkipSchedule) { parseIR(C, R"IR( define void @foo(ptr %ptr, <2 x float> %vec2, <3 x float> %vec3, i8 %arg, float %farg0, float %farg1, i64 %v0, i64 %v1, i32 %v2) { +entry: %gep0 = getelementptr float, ptr %ptr, i32 0 %gep1 = getelementptr float, ptr %ptr, i32 1 + store float %farg0, ptr %gep1 + br label %bb + +bb: %gep3 = getelementptr float, ptr %ptr, i32 3 %ld0 = load float, ptr %gep0 %ld0b = load float, ptr %gep0 @@ -89,10 +102,14 @@ define void @foo(ptr %ptr, <2 x float> %vec2, <3 x float> %vec3, i8 %arg, float sandboxir::Context Ctx(C); auto *F = Ctx.createFunction(LLVMF); - auto *BB = &*F->begin(); - auto It = BB->begin(); + auto *EntryBB = getBasicBlockByName(F, "entry"); + auto It = EntryBB->begin(); [[maybe_unused]] auto *Gep0 = cast(&*It++); [[maybe_unused]] auto *Gep1 = cast(&*It++); + auto *St1Entry = cast(&*It++); + + auto *BB = getBasicBlockByName(F, "bb"); + It = BB->begin(); [[maybe_unused]] auto *Gep3 = cast(&*It++); auto *Ld0 = cast(&*It++); auto *Ld0b = cast(&*It++); @@ -162,6 +179,14 @@ define void @foo(ptr %ptr, <2 x float> %vec2, <3 x float> %vec3, i8 %arg, float EXPECT_EQ(cast(Result).getReason(), sandboxir::ResultReason::DiffWrapFlags); } + { + // Check DiffBBs + const auto &Result = + Legality.canVectorize({St0, St1Entry}, /*SkipScheduling=*/true); + EXPECT_TRUE(isa(Result)); + EXPECT_EQ(cast(Result).getReason(), + sandboxir::ResultReason::DiffBBs); + } { // Check DiffTypes for unary operands that have a different type. const auto &Result = Legality.canVectorize({Trunc64to8, Trunc32to8}, From 48657bf29b01e95749b5ecd8c7f675c14a7948d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 24 Jan 2025 15:52:05 -0800 Subject: [PATCH 066/432] [flang][cuda] Handle launch of cooperative kernel (#124362) Add a `CUFLaunchCooperativeKernel` entry point and lower gpu.launch_func with the grid_global attribute to it.
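As a rough host-side illustration of the occupancy-driven grid sizing the new entry point performs when a grid dimension is left negative (simplified from the runtime code in the diff below; `launchCooperativeByOccupancy` and its lack of error handling are illustrative assumptions):

```cpp
#include <cuda_runtime.h>

// Sketch: size a cooperative grid from device occupancy, then launch. A
// cooperative launch must be resident on the device all at once, so the grid
// is capped at (SM count) * (max active blocks per SM).
cudaError_t launchCooperativeByOccupancy(const void *kernel, dim3 block,
                                         int smem, void **params) {
  int dev = 0, numSMs = 0, maxBlocksPerSM = 0;
  cudaGetDevice(&dev);
  cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, dev);
  int threadsPerBlock = static_cast<int>(block.x * block.y * block.z);
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxBlocksPerSM, kernel,
                                                threadsPerBlock, smem);
  dim3 grid(numSMs * maxBlocksPerSM, 1, 1);
  return cudaLaunchCooperativeKernel(kernel, grid, block, params, smem,
                                     /*stream=*/0);
}
```

The actual entry point additionally divides the occupancy-derived block count by any grid dimensions the caller did specify, and it rejects more than one negative grid dimension.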
--- flang/include/flang/Runtime/CUDA/kernel.h | 4 ++ .../Transforms/CUFGPUToLLVMConversion.cpp | 18 +++-- flang/runtime/CUDA/kernel.cpp | 65 +++++++++++++++++++ flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir | 35 ++++++++++ 4 files changed, 116 insertions(+), 6 deletions(-) diff --git a/flang/include/flang/Runtime/CUDA/kernel.h b/flang/include/flang/Runtime/CUDA/kernel.h index 85afda09e347a..1f812b580327a 100644 --- a/flang/include/flang/Runtime/CUDA/kernel.h +++ b/flang/include/flang/Runtime/CUDA/kernel.h @@ -28,6 +28,10 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX, intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, void **params, void **extra); +void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX, + intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, void **params, void **extra); + } // extern "C" #endif // FORTRAN_RUNTIME_CUDA_KERNEL_H_ diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp index 60aa401e1cc8c..c469b5a95b044 100644 --- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp @@ -139,20 +139,26 @@ struct GPULaunchKernelConversion adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(), dynamicMemorySize, kernelArgs, nullPtr}); } else { - auto funcOp = mod.lookupSymbol( - RTNAME_STRING(CUFLaunchKernel)); + auto procAttr = + op->getAttrOfType(cuf::getProcAttrName()); + bool isGridGlobal = + procAttr && procAttr.getValue() == cuf::ProcAttribute::GridGlobal; + llvm::StringRef fctName = isGridGlobal + ? RTNAME_STRING(CUFLaunchCooperativeKernel) + : RTNAME_STRING(CUFLaunchKernel); + auto funcOp = mod.lookupSymbol(fctName); auto funcTy = mlir::LLVM::LLVMFunctionType::get( voidTy, {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy}, /*isVarArg=*/false); - auto cufLaunchKernel = mlir::SymbolRefAttr::get( - mod.getContext(), RTNAME_STRING(CUFLaunchKernel)); + auto cufLaunchKernel = + mlir::SymbolRefAttr::get(mod.getContext(), fctName); if (!funcOp) { mlir::OpBuilder::InsertionGuard insertGuard(rewriter); rewriter.setInsertionPointToStart(mod.getBody()); - auto launchKernelFuncOp = rewriter.create( - loc, RTNAME_STRING(CUFLaunchKernel), funcTy); + auto launchKernelFuncOp = + rewriter.create(loc, fctName, funcTy); launchKernelFuncOp.setVisibility( mlir::SymbolTable::Visibility::Private); } diff --git a/flang/runtime/CUDA/kernel.cpp b/flang/runtime/CUDA/kernel.cpp index bdc04ccb17672..02d89fb8423a5 100644 --- a/flang/runtime/CUDA/kernel.cpp +++ b/flang/runtime/CUDA/kernel.cpp @@ -151,4 +151,69 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX, CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params)); } +void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX, + intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, void **params, void **extra) { + dim3 gridDim; + gridDim.x = gridX; + gridDim.y = gridY; + gridDim.z = gridZ; + dim3 blockDim; + blockDim.x = blockX; + blockDim.y = blockY; + blockDim.z = blockZ; + unsigned nbNegGridDim{0}; + if (gridX < 0) { + ++nbNegGridDim; + } + if (gridY < 0) { + ++nbNegGridDim; + } + if (gridZ < 0) { + ++nbNegGridDim; + } + if (nbNegGridDim == 1) { + int maxBlocks, nbBlocks, dev, multiProcCount; + cudaError_t err1, 
err2; + nbBlocks = blockDim.x * blockDim.y * blockDim.z; + cudaGetDevice(&dev); + err1 = cudaDeviceGetAttribute( + &multiProcCount, cudaDevAttrMultiProcessorCount, dev); + err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &maxBlocks, kernel, nbBlocks, smem); + if (err1 == cudaSuccess && err2 == cudaSuccess) { + maxBlocks = multiProcCount * maxBlocks; + } + if (maxBlocks > 0) { + if (gridX > 0) { + maxBlocks = maxBlocks / gridDim.x; + } + if (gridY > 0) { + maxBlocks = maxBlocks / gridDim.y; + } + if (gridZ > 0) { + maxBlocks = maxBlocks / gridDim.z; + } + if (maxBlocks < 1) { + maxBlocks = 1; + } + if (gridX < 0) { + gridDim.x = maxBlocks; + } + if (gridY < 0) { + gridDim.y = maxBlocks; + } + if (gridZ < 0) { + gridDim.z = maxBlocks; + } + } + } else if (nbNegGridDim > 1) { + Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; + terminator.Crash("Too many invalid grid dimensions"); + } + cudaStream_t stream = 0; // TODO stream managment + CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel( + kernel, gridDim, blockDim, params, smem, stream)); +} + } // extern "C" diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir index 3db2336c90a7d..0827e378c7c07 100644 --- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir +++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir @@ -131,3 +131,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, d // CHECK-LABEL: llvm.func @_QQmain() // CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1 // CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}}) + +// ----- + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + llvm.func @_QMmod1Phost_sub() { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.mlir.constant(40 : i64) : i64 + %3 = llvm.mlir.constant(16 : i32) : i32 + %4 = llvm.mlir.constant(25 : i32) : i32 + %5 = llvm.mlir.constant(21 : i32) : i32 + %6 = llvm.mlir.constant(17 : i32) : i32 + %7 = llvm.mlir.constant(1 : index) : i64 + %8 = llvm.mlir.constant(27 : i32) : i32 + %9 = llvm.mlir.constant(6 : i32) : i32 + %10 = llvm.mlir.constant(1 : i32) : i32 + %11 = llvm.mlir.constant(0 : i32) : i32 + %12 = llvm.mlir.constant(10 : index) : i64 + %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr + %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr + gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) 
threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc} llvm.return } llvm.func @_QMmod1Psub1(!llvm.ptr) -> () llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5() {addr_space = 0 : i32} : !llvm.array<2 x i8> { %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8> llvm.return %0 : !llvm.array<2 x i8> } llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"} llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"} gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">] } // CHECK-LABEL: llvm.func @_QMmod1Phost_sub() // CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel From c725a95e088dea14953c2d891d04429bc50b912e Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Fri, 24 Jan 2025 15:58:13 -0800 Subject: [PATCH 067/432] [MemProf] Convert Hot contexts to NotCold early (#124219) While we convert hot contexts to notcold contexts during the cloning step, their existence was greatly limiting the context trimming performed when we add the MemProf profile to the IR. To address this, any hot contexts are converted to notcold contexts immediately after the initial check for unambiguous allocation types, before that check is repeated and before metadata is added during context trimming. Note that hot hints are now disabled by default; however, this change avoids adding unnecessary overhead if they are re-enabled. --- .../include/llvm/Analysis/MemoryProfileInfo.h | 14 ++++ llvm/lib/Analysis/MemoryProfileInfo.cpp | 29 +++++++- llvm/test/Transforms/PGOProfile/memprof.ll | 9 --- .../Analysis/MemoryProfileInfoTest.cpp | 73 ++++++------------- 4 files changed, 62 insertions(+), 63 deletions(-) diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h index 215139caef696..deb7ab134c161 100644 --- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h +++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h @@ -65,6 +65,15 @@ class CallStackTrie { std::map Callers; CallStackTrieNode(AllocationType Type) : AllocTypes(static_cast(Type)) {} + void addAllocType(AllocationType AllocType) { + AllocTypes |= static_cast(AllocType); + } + void removeAllocType(AllocationType AllocType) { + AllocTypes &= ~static_cast(AllocType); + } + bool hasAllocType(AllocationType AllocType) const { + return AllocTypes & static_cast(AllocType); + } }; // The node for the allocation at the root. @@ -85,6 +94,11 @@ class CallStackTrie { void collectContextSizeInfo(CallStackTrieNode *Node, std::vector &ContextSizeInfo); + // Recursively convert hot allocation types to notcold, since we don't + // actually do any cloning for hot contexts, to facilitate more aggressive + // pruning of contexts. + void convertHotToNotCold(CallStackTrieNode *Node); + // Recursive helper to trim contexts and create metadata nodes.
bool buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, std::vector &MIBCallStack, diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 52f4adbdb0429..5553a2e2dd24b 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -147,7 +147,7 @@ void CallStackTrie::addCallStack( First = false; if (Alloc) { assert(AllocStackId == StackId); - Alloc->AllocTypes |= static_cast(AllocType); + Alloc->addAllocType(AllocType); } else { AllocStackId = StackId; Alloc = new CallStackTrieNode(AllocType); @@ -159,7 +159,7 @@ void CallStackTrie::addCallStack( auto Next = Curr->Callers.find(StackId); if (Next != Curr->Callers.end()) { Curr = Next->second; - Curr->AllocTypes |= static_cast(AllocType); + Curr->addAllocType(AllocType); continue; } // Otherwise add a new caller node. @@ -228,6 +228,15 @@ void CallStackTrie::collectContextSizeInfo( collectContextSizeInfo(Caller.second, ContextSizeInfo); } +void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) { + if (Node->hasAllocType(AllocationType::Hot)) { + Node->removeAllocType(AllocationType::Hot); + Node->addAllocType(AllocationType::NotCold); + } + for (auto &Caller : Node->Callers) + convertHotToNotCold(Caller.second); +} + // Recursive helper to trim contexts and create metadata nodes. // Caller should have pushed Node's loc to MIBCallStack. Doing this in the // caller makes it simpler to handle the many early returns in this method. @@ -307,6 +316,22 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) { "single"); return false; } + // If there were any hot allocation contexts, the Alloc trie node would have + // the Hot type set. If so, because we don't currently support cloning for hot + // contexts, they should be converted to NotCold. This happens in the cloning + // support anyway, however, doing this now enables more aggressive context + // trimming when building the MIB metadata (and possibly may make the + // allocation have a single NotCold allocation type), greatly reducing + // overheads in bitcode, cloning memory and cloning time. + if (Alloc->hasAllocType(AllocationType::Hot)) { + convertHotToNotCold(Alloc); + // Check whether we now have a single alloc type. 
+ if (hasSingleAllocType(Alloc->AllocTypes)) { + addSingleAllocTypeAttribute(CI, (AllocationType)Alloc->AllocTypes, + "single"); + return false; + } + } auto &Ctx = CI->getContext(); std::vector MIBCallStack; MIBCallStack.push_back(AllocStackId); diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll index 367069e993fe1..6aa2d307a1dc8 100644 --- a/llvm/test/Transforms/PGOProfile/memprof.ll +++ b/llvm/test/Transforms/PGOProfile/memprof.ll @@ -84,8 +84,6 @@ ; RUN: llvm-profdata merge -memprof-random-hotness -memprof-random-hotness-seed=1730170724 %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdatarand2 2>&1 | FileCheck %s --check-prefix=RAND2 ; RAND2: random hotness seed = 1730170724 ; RUN: opt < %s -passes='memprof-use' -pgo-warn-missing-function -S -stats 2>&1 | FileCheck %s --check-prefixes=MEMPROFRAND2,ALL,MEMPROFONLY,MEMPROFSTATS -;; Check with hot hints enabled -; RUN: opt < %s -memprof-use-hot-hints -passes='memprof-use' -pgo-warn-missing-function -S -stats 2>&1 | FileCheck %s --check-prefixes=MEMPROFRAND2HOT ; MEMPROFMATCHINFO: MemProf notcold context with id 1093248920606587996 has total profiled size 10 is matched ; MEMPROFMATCHINFO: MemProf notcold context with id 5725971306423925017 has total profiled size 10 is matched @@ -413,13 +411,6 @@ for.end: ; preds = %for.cond ; MEMPROFRAND2: !"notcold" ; MEMPROFRAND2: !"notcold" -;; With hot hints enabled the last 2 should be hot. -; MEMPROFRAND2HOT: !"cold" -; MEMPROFRAND2HOT: !"cold" -; MEMPROFRAND2HOT: !"cold" -; MEMPROFRAND2HOT: !"hot" -; MEMPROFRAND2HOT: !"hot" - ; MEMPROFSTATS: 8 memprof - Number of alloc contexts in memory profile. ; MEMPROFSTATS: 10 memprof - Number of callsites in memory profile. ; MEMPROFSTATS: 6 memprof - Number of functions having valid memory profile. diff --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp index 3888faf5453d3..b4e81e69116e8 100644 --- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp +++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp @@ -165,6 +165,8 @@ define i32* @test() { %1 = bitcast i8* %call2 to i32* %call3 = call noalias dereferenceable_or_null(40) i8* @malloc(i64 noundef 40) %2 = bitcast i8* %call3 to i32* + %call4 = call noalias dereferenceable_or_null(40) i8* @malloc(i64 noundef 40) + %3 = bitcast i8* %call4 to i32* ret i32* %1 } declare dso_local noalias noundef i8* @malloc(i64 noundef) @@ -204,6 +206,18 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) EXPECT_FALSE(Call3->hasMetadata(LLVMContext::MD_memprof)); EXPECT_TRUE(Call3->hasFnAttr("memprof")); EXPECT_EQ(Call3->getFnAttr("memprof").getValueAsString(), "hot"); + + // Fourth call has hot and non-cold contexts. These should be treated as + // notcold and given a notcold attribute. 
+ CallStackTrie Trie4; + Trie4.addCallStack(AllocationType::Hot, {5, 6}); + Trie4.addCallStack(AllocationType::NotCold, {5, 7, 8}); + CallBase *Call4 = findCall(*Func, "call4"); + Trie4.buildAndAttachMIBMetadata(Call4); + + EXPECT_FALSE(Call4->hasMetadata(LLVMContext::MD_memprof)); + EXPECT_TRUE(Call4->hasFnAttr("memprof")); + EXPECT_EQ(Call4->getFnAttr("memprof").getValueAsString(), "notcold"); } // Test CallStackTrie::addCallStack interface taking allocation type and list of @@ -299,56 +313,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Cold); else { ASSERT_EQ(StackId->getZExtValue(), 3u); - EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Hot); - } - } -} - -// Test CallStackTrie::addCallStack interface taking allocation type and list of -// call stack ids. -// Test that an allocation call reached by both non cold and hot call stacks -// gets memprof metadata representing the different allocation type contexts. -TEST_F(MemoryProfileInfoTest, NotColdAndHotMIB) { - LLVMContext C; - std::unique_ptr M = makeLLVMModule(C, - R"IR( -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-pc-linux-gnu" -define i32* @test() { -entry: - %call = call noalias dereferenceable_or_null(40) i8* @malloc(i64 noundef 40) - %0 = bitcast i8* %call to i32* - ret i32* %0 -} -declare dso_local noalias noundef i8* @malloc(i64 noundef) -)IR"); - - Function *Func = M->getFunction("test"); - - CallStackTrie Trie; - Trie.addCallStack(AllocationType::NotCold, {1, 2}); - Trie.addCallStack(AllocationType::Hot, {1, 3}); - - CallBase *Call = findCall(*Func, "call"); - Trie.buildAndAttachMIBMetadata(Call); - - EXPECT_FALSE(Call->hasFnAttr("memprof")); - EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof)); - MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof); - ASSERT_EQ(MemProfMD->getNumOperands(), 2u); - for (auto &MIBOp : MemProfMD->operands()) { - MDNode *MIB = dyn_cast(MIBOp); - MDNode *StackMD = getMIBStackNode(MIB); - ASSERT_NE(StackMD, nullptr); - ASSERT_EQ(StackMD->getNumOperands(), 2u); - auto *StackId = mdconst::dyn_extract(StackMD->getOperand(0)); - ASSERT_EQ(StackId->getZExtValue(), 1u); - StackId = mdconst::dyn_extract(StackMD->getOperand(1)); - if (StackId->getZExtValue() == 2u) + // Hot contexts are converted to NotCold when building the metadata. EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); - else { - ASSERT_EQ(StackId->getZExtValue(), 3u); - EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Hot); } } } @@ -401,7 +367,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); } else { ASSERT_EQ(StackId->getZExtValue(), 4u); - EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Hot); + // Hot contexts are converted to NotCold when building the metadata. + EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); } } } @@ -463,7 +430,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); else { ASSERT_EQ(StackId->getZExtValue(), 8u); - EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Hot); + // Hot contexts are converted to NotCold when building the metadata. 
+ EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); } } } @@ -606,7 +574,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); else { ASSERT_EQ(StackId->getZExtValue(), 8u); - EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Hot); + // Hot contexts are converted to NotCold when building the new metadata. + EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); } } } From db1ee18eda6329d7577ad019a47822220b3e293d Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 24 Jan 2025 15:56:07 -0800 Subject: [PATCH 068/432] NFC: Typo fix Change-Id: I08470bc617490558250136ea35a4964003fa9981 --- llvm/docs/AMDGPUUsage.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 8f09df2406f10..71f11bf89368f 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1327,7 +1327,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics. with the fifth i32 operand. The i1 sixth operand is used to clamp the output. The i1s preceding the vector operands decide the signedness. - llvm.amdgcn.sched_barrier Controls the types of instructions that may be allowed to cross the intrinsic + llvm.amdgcn.sched.barrier Controls the types of instructions that may be allowed to cross the intrinsic during instruction scheduling. The parameter is a mask for the instruction types that can cross the intrinsic. @@ -1345,7 +1345,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics. - 0x0200: All DS write instructions may be scheduled across sched_barrier. - 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across sched_barrier. - llvm.amdgcn.sched_group_barrier Creates schedule groups with specific properties to create custom scheduling + llvm.amdgcn.sched.group.barrier Creates schedule groups with specific properties to create custom scheduling pipelines. The ordering between groups is enforced by the instruction scheduler. The intrinsic applies to the code that preceeds the intrinsic. The intrinsic takes three values that control the behavior of the schedule groups. @@ -1369,7 +1369,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics. | ``// 5 MFMA`` | ``__builtin_amdgcn_sched_group_barrier(8, 5, 0)`` - llvm.amdgcn.iglp_opt An **experimental** intrinsic for instruction group level parallelism. The intrinsic + llvm.amdgcn.iglp.opt An **experimental** intrinsic for instruction group level parallelism. The intrinsic implements predefined intruction scheduling orderings. The intrinsic applies to the surrounding scheduling region. The intrinsic takes a value that specifies the strategy. The compiler implements two strategies. From 1b4bd4e1a5120c8bb4daa44787a3bc4559b6b3b4 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 24 Jan 2025 16:43:02 -0800 Subject: [PATCH 069/432] [BOLT][AArch64] Remove assertions from jump table heuristic (#124372) The code for jump table detection on AArch64 asserts liberally whenever the input instruction sequence does not match the expected pattern. As a result, BOLT fails to process binaries with such sequences instead of ignoring functions with unknown control flow. Remove asserts in analyzeIndirectBranchFragment() and mark indirect jumps as instructions with unknown control flow instead. 
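Reduced to a standalone sketch, the change is a switch from hard assertions to soft matching failures (toy types below, not BOLT's real MCInst API):

```cpp
// Toy model of the matcher's new error handling; AArch64 opcodes and the
// MCInst interface are heavily simplified here.
enum Opcode { ADDXrx, ADDXrs };

struct Inst {
  Opcode Op;
  Opcode getOpcode() const { return Op; }
};

// Before: assert(I.getOpcode() == ADDXrx && "Failed to match indirect branch!");
// After: an unmatched sequence is a soft failure; the caller then marks the
// jump as unknown control flow instead of aborting.
bool matchJumpTablePattern(const Inst &I) {
  if (I.getOpcode() != ADDXrx)
    return false;
  // ... continue matching the rest of the sequence ...
  return true;
}
```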
--- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 45 ++++++++++--------- bolt/test/AArch64/jump-table-heuristic-fail.s | 29 ++++++++++++ bolt/test/AArch64/test-indirect-branch.s | 9 ++-- 3 files changed, 58 insertions(+), 25 deletions(-) create mode 100644 bolt/test/AArch64/jump-table-heuristic-fail.s diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index ac709c5dd063a..4b21ff719b3ab 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -834,6 +834,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { /// # of this BB) /// br x0 # Indirect jump instruction /// + /// Return true on successful jump table instruction sequence match, false + /// otherwise. bool analyzeIndirectBranchFragment( const MCInst &Inst, DenseMap> &UDChain, @@ -842,6 +844,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // Expect AArch64 BR assert(Inst.getOpcode() == AArch64::BR && "Unexpected opcode"); + JumpTable = nullptr; + // Match the indirect branch pattern for aarch64 SmallVector &UsesRoot = UDChain[&Inst]; if (UsesRoot.size() == 0 || UsesRoot[0] == nullptr) @@ -879,8 +883,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // Parsed as ADDXrs reg:x8 reg:x8 reg:x12 imm:0 return false; } - assert(DefAdd->getOpcode() == AArch64::ADDXrx && - "Failed to match indirect branch!"); + if (DefAdd->getOpcode() != AArch64::ADDXrx) + return false; // Validate ADD operands int64_t OperandExtension = DefAdd->getOperand(3).getImm(); @@ -897,8 +901,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // ldr w7, [x6] // add x6, x6, w7, sxtw => no shift amount // br x6 - errs() << "BOLT-WARNING: " - "Failed to match indirect branch: ShiftVAL != 2 \n"; + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: " + "failed to match indirect branch: ShiftVAL != 2\n"); return false; } @@ -909,7 +913,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { else if (ExtendType == AArch64_AM::SXTW) ScaleValue = 4LL; else - llvm_unreachable("Failed to match indirect branch! (fragment 3)"); + return false; // Match an ADR to load base address to be used when addressing JT targets SmallVector &UsesAdd = UDChain[DefAdd]; @@ -920,18 +924,15 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return false; } MCInst *DefBaseAddr = UsesAdd[1]; - assert(DefBaseAddr->getOpcode() == AArch64::ADR && - "Failed to match indirect branch pattern! (fragment 3)"); + if (DefBaseAddr->getOpcode() != AArch64::ADR) + return false; PCRelBase = DefBaseAddr; // Match LOAD to load the jump table (relative) target const MCInst *DefLoad = UsesAdd[2]; - assert(mayLoad(*DefLoad) && - "Failed to match indirect branch load pattern! (1)"); - assert((ScaleValue != 1LL || isLDRB(*DefLoad)) && - "Failed to match indirect branch load pattern! (2)"); - assert((ScaleValue != 2LL || isLDRH(*DefLoad)) && - "Failed to match indirect branch load pattern! (3)"); + if (!mayLoad(*DefLoad) || (ScaleValue == 1LL && !isLDRB(*DefLoad)) || + (ScaleValue == 2LL && !isLDRH(*DefLoad))) + return false; // Match ADD that calculates the JumpTable Base Address (not the offset) SmallVector &UsesLoad = UDChain[DefLoad]; @@ -941,7 +942,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { isRegToRegMove(*DefJTBaseAdd, From, To)) { // Sometimes base address may have been defined in another basic block // (hoisted). Return with no jump table info. 
- JumpTable = nullptr; return true; } @@ -953,24 +953,27 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // adr x12, 0x247b30 <__gettextparse+0x5b0> // add x13, x12, w13, sxth #2 // br x13 - errs() << "BOLT-WARNING: Failed to match indirect branch: " - "nop/adr instead of adrp/add \n"; + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: failed to match indirect branch: " + "nop/adr instead of adrp/add\n"); return false; } - assert(DefJTBaseAdd->getOpcode() == AArch64::ADDXri && - "Failed to match jump table base address pattern! (1)"); + if (DefJTBaseAdd->getOpcode() != AArch64::ADDXri) { + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: failed to match jump table base " + "address pattern! (1)\n"); + return false; + } if (DefJTBaseAdd->getOperand(2).isImm()) Offset = DefJTBaseAdd->getOperand(2).getImm(); SmallVector &UsesJTBaseAdd = UDChain[DefJTBaseAdd]; const MCInst *DefJTBasePage = UsesJTBaseAdd[1]; if (DefJTBasePage == nullptr || isLoadFromStack(*DefJTBasePage)) { - JumpTable = nullptr; return true; } - assert(DefJTBasePage->getOpcode() == AArch64::ADRP && - "Failed to match jump table base page pattern! (2)"); + if (DefJTBasePage->getOpcode() != AArch64::ADRP) + return false; + if (DefJTBasePage->getOperand(1).isExpr()) JumpTable = DefJTBasePage->getOperand(1).getExpr(); return true; diff --git a/bolt/test/AArch64/jump-table-heuristic-fail.s b/bolt/test/AArch64/jump-table-heuristic-fail.s new file mode 100644 index 0000000000000..724171ac39925 --- /dev/null +++ b/bolt/test/AArch64/jump-table-heuristic-fail.s @@ -0,0 +1,29 @@ +## Verify that BOLT does not crash while encountering instruction sequence that +## does not perfectly match jump table pattern. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags --target=aarch64-unknown-linux %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --print-cfg 2>&1 | FileCheck %s + + .section .text + .align 4 + .globl _start + .type _start, %function +_start: + sub w0, w0, #0x4a +## The address loaded into x22 is undefined. However, the instructions that +## follow ldr, use the x22 address as a regular jump table. + ldr x22, [x29, #0x98] + ldrb w0, [x22, w0, uxtw] + adr x1, #12 + add x0, x1, w0, sxtb #2 + br x0 +# CHECK: br x0 # UNKNOWN +.L0: + ret +.size _start, .-_start + +## Force relocation mode. + .reloc 0, R_AARCH64_NONE diff --git a/bolt/test/AArch64/test-indirect-branch.s b/bolt/test/AArch64/test-indirect-branch.s index 168e50c8f47f5..1e16e76b11530 100644 --- a/bolt/test/AArch64/test-indirect-branch.s +++ b/bolt/test/AArch64/test-indirect-branch.s @@ -3,10 +3,11 @@ // clang-format off -// REQUIRES: system-linux +// REQUIRES: system-linux, asserts + // RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o // RUN: %clang %cflags --target=aarch64-unknown-linux %t.o -o %t.exe -Wl,-q -// RUN: llvm-bolt %t.exe -o %t.bolt --print-cfg --strict\ +// RUN: llvm-bolt %t.exe -o %t.bolt --print-cfg --strict --debug-only=mcplus \ // RUN: -v=1 2>&1 | FileCheck %s // Pattern 1: there is no shift amount after the 'add' instruction. 
@@ -39,7 +40,7 @@ _start: // svc #0 // Pattern 1 -// CHECK: BOLT-WARNING: Failed to match indirect branch: ShiftVAL != 2 +// CHECK: BOLT-DEBUG: failed to match indirect branch: ShiftVAL != 2 .globl test1 .type test1, %function test1: @@ -57,7 +58,7 @@ test1_2: ret // Pattern 2 -// CHECK: BOLT-WARNING: Failed to match indirect branch: nop/adr instead of adrp/add +// CHECK: BOLT-DEBUG: failed to match indirect branch: nop/adr instead of adrp/add .globl test2 .type test2, %function test2: From d92bac8a3ebb19106f6bca6b7613a27c52cb48ab Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Fri, 24 Jan 2025 16:48:35 -0800 Subject: [PATCH 070/432] [HLSL] Introduce address space `hlsl_constant(2)` for constant buffer declarations (#123411) Introduces a new address space `hlsl_constant(2)` for constant buffer declarations. This address space is applied to declarations inside a `cbuffer` block. Later on, it will also be applied to the `ConstantBuffer` syntax and the default `$Globals` constant buffer. Clang codegen translates constant buffer declarations to global variables and loads from the `hlsl_constant(2)` address space. Follow-up work will add metadata that maps these globals to individual constant buffers and enables their transformation to the appropriate constant buffer load intrinsics later in an LLVM pass. Fixes #123406 --- clang/include/clang/Basic/AddressSpaces.h | 1 + clang/lib/AST/TypePrinter.cpp | 6 +- clang/lib/Basic/Targets/AArch64.h | 1 + clang/lib/Basic/Targets/AMDGPU.cpp | 21 ++++--- clang/lib/Basic/Targets/DirectX.h | 1 + clang/lib/Basic/Targets/NVPTX.h | 1 + clang/lib/Basic/Targets/SPIR.h | 2 + clang/lib/Basic/Targets/SystemZ.h | 1 + clang/lib/Basic/Targets/TCE.h | 1 + clang/lib/Basic/Targets/WebAssembly.h | 1 + clang/lib/Basic/Targets/X86.h | 1 + clang/lib/CodeGen/CGHLSLRuntime.cpp | 17 +---- clang/lib/Sema/SemaHLSL.cpp | 13 +++- .../ast-dump-comment-cbuffer-tbuffer.hlsl | 62 ------------------- .../AST/HLSL/ast-dump-comment-cbuffer.hlsl | 32 ++++++++++ clang/test/AST/HLSL/cbuffer.hlsl | 24 +++---- .../test/AST/HLSL/cbuffer_and_namespaces.hlsl | 14 ++--- clang/test/AST/HLSL/packoffset.hlsl | 38 ++++++------ clang/test/AST/HLSL/pch_hlsl_buffer.hlsl | 8 +-- .../test/AST/HLSL/resource_binding_attr.hlsl | 8 +-- clang/test/CodeGenHLSL/cbuf.hlsl | 19 ++++-- clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl | 14 +++-- .../static_global_and_function_in_cb.hlsl | 15 +++-- .../SemaTemplate/address_space-dependent.cpp | 2 +- 24 files changed, 148 insertions(+), 155 deletions(-) delete mode 100644 clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl create mode 100644 clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl diff --git a/clang/include/clang/Basic/AddressSpaces.h b/clang/include/clang/Basic/AddressSpaces.h index 7b723d508fff1..d18bfe54931f9 100644 --- a/clang/include/clang/Basic/AddressSpaces.h +++ b/clang/include/clang/Basic/AddressSpaces.h @@ -58,6 +58,7 @@ enum class LangAS : unsigned { // HLSL specific address spaces. hlsl_groupshared, + hlsl_constant, // Wasm specific address spaces.
wasm_funcref, diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index a850410ffc846..31695374cb52b 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -2552,10 +2552,12 @@ std::string Qualifiers::getAddrSpaceAsString(LangAS AS) { return "__uptr __ptr32"; case LangAS::ptr64: return "__ptr64"; - case LangAS::wasm_funcref: - return "__funcref"; case LangAS::hlsl_groupshared: return "groupshared"; + case LangAS::hlsl_constant: + return "hlsl_constant"; + case LangAS::wasm_funcref: + return "__funcref"; default: return std::to_string(toTargetAddressSpace(AS)); } diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index ecf80b23a508c..600940f5e4e23 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -44,6 +44,7 @@ static const unsigned ARM64AddrSpaceMap[] = { static_cast(AArch64AddrSpace::ptr32_uptr), static_cast(AArch64AddrSpace::ptr64), 0, // hlsl_groupshared + 0, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 20, // wasm_funcref diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index 99f8f2944e279..0d308cb6af969 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -59,6 +59,7 @@ const LangASMap AMDGPUTargetInfo::AMDGPUDefIsGenMap = { llvm::AMDGPUAS::FLAT_ADDRESS, // ptr32_uptr llvm::AMDGPUAS::FLAT_ADDRESS, // ptr64 llvm::AMDGPUAS::FLAT_ADDRESS, // hlsl_groupshared + llvm::AMDGPUAS::CONSTANT_ADDRESS, // hlsl_constant }; const LangASMap AMDGPUTargetInfo::AMDGPUDefIsPrivMap = { @@ -74,16 +75,16 @@ const LangASMap AMDGPUTargetInfo::AMDGPUDefIsPrivMap = { llvm::AMDGPUAS::CONSTANT_ADDRESS, // cuda_constant llvm::AMDGPUAS::LOCAL_ADDRESS, // cuda_shared // SYCL address space values for this map are dummy - llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_global - llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_global_device - llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_global_host - llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_local - llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_private - llvm::AMDGPUAS::FLAT_ADDRESS, // ptr32_sptr - llvm::AMDGPUAS::FLAT_ADDRESS, // ptr32_uptr - llvm::AMDGPUAS::FLAT_ADDRESS, // ptr64 - llvm::AMDGPUAS::FLAT_ADDRESS, // hlsl_groupshared - + llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_global + llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_global_device + llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_global_host + llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_local + llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_private + llvm::AMDGPUAS::FLAT_ADDRESS, // ptr32_sptr + llvm::AMDGPUAS::FLAT_ADDRESS, // ptr32_uptr + llvm::AMDGPUAS::FLAT_ADDRESS, // ptr64 + llvm::AMDGPUAS::FLAT_ADDRESS, // hlsl_groupshared + llvm::AMDGPUAS::CONSTANT_ADDRESS, // hlsl_constant }; } // namespace targets } // namespace clang diff --git a/clang/lib/Basic/Targets/DirectX.h b/clang/lib/Basic/Targets/DirectX.h index ab22d1281a4df..4e6bc0e040398 100644 --- a/clang/lib/Basic/Targets/DirectX.h +++ b/clang/lib/Basic/Targets/DirectX.h @@ -42,6 +42,7 @@ static const unsigned DirectXAddrSpaceMap[] = { 0, // ptr32_uptr 0, // ptr64 3, // hlsl_groupshared + 2, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 
20, // wasm_funcref diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h index d81b89a7f24ac..c6531148fe30c 100644 --- a/clang/lib/Basic/Targets/NVPTX.h +++ b/clang/lib/Basic/Targets/NVPTX.h @@ -46,6 +46,7 @@ static const unsigned NVPTXAddrSpaceMap[] = { 0, // ptr32_uptr 0, // ptr64 0, // hlsl_groupshared + 0, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 20, // wasm_funcref diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 5a328b9ceeb1d..c0849b69dcdb3 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -47,6 +47,7 @@ static const unsigned SPIRDefIsPrivMap[] = { 0, // ptr32_uptr 0, // ptr64 0, // hlsl_groupshared + 2, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 20, // wasm_funcref @@ -80,6 +81,7 @@ static const unsigned SPIRDefIsGenMap[] = { 0, // ptr32_uptr 0, // ptr64 0, // hlsl_groupshared + 0, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 20, // wasm_funcref diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h index d05948586c467..bd2827cf13a5b 100644 --- a/clang/lib/Basic/Targets/SystemZ.h +++ b/clang/lib/Basic/Targets/SystemZ.h @@ -42,6 +42,7 @@ static const unsigned ZOSAddressMap[] = { 1, // ptr32_uptr 0, // ptr64 0, // hlsl_groupshared + 0, // hlsl_constant 0 // wasm_funcref }; diff --git a/clang/lib/Basic/Targets/TCE.h b/clang/lib/Basic/Targets/TCE.h index d6280b02f07b2..edec30bf69de0 100644 --- a/clang/lib/Basic/Targets/TCE.h +++ b/clang/lib/Basic/Targets/TCE.h @@ -51,6 +51,7 @@ static const unsigned TCEOpenCLAddrSpaceMap[] = { 0, // ptr32_uptr 0, // ptr64 0, // hlsl_groupshared + 0, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 20, // wasm_funcref diff --git a/clang/lib/Basic/Targets/WebAssembly.h b/clang/lib/Basic/Targets/WebAssembly.h index 0a14da6a277b8..cfecc59ac75fd 100644 --- a/clang/lib/Basic/Targets/WebAssembly.h +++ b/clang/lib/Basic/Targets/WebAssembly.h @@ -42,6 +42,7 @@ static const unsigned WebAssemblyAddrSpaceMap[] = { 0, // ptr32_uptr 0, // ptr64 0, // hlsl_groupshared + 0, // hlsl_constant 20, // wasm_funcref }; diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 2c200e64a3d84..8bd54e362526f 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -46,6 +46,7 @@ static const unsigned X86AddrSpaceMap[] = { 271, // ptr32_uptr 272, // ptr64 0, // hlsl_groupshared + 0, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 20, // wasm_funcref diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 345e218f42451..2ce54cc3c52ef 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -100,22 +100,6 @@ GlobalVariable *replaceBuffer(CGHLSLRuntime::Buffer &Buf) { llvm::formatv("{0}{1}", Buf.Name, Buf.IsCBuffer ? ".cb." : ".tb."), GlobalValue::NotThreadLocal); - IRBuilder<> B(CBGV->getContext()); - Value *ZeroIdx = B.getInt32(0); - // Replace Const use with CB use. 
-  for (auto &[GV, Offset] : Buf.Constants) {
-    Value *GEP =
-        B.CreateGEP(Buf.LayoutStruct, CBGV, {ZeroIdx, B.getInt32(Offset)});
-
-    assert(Buf.LayoutStruct->getElementType(Offset) == GV->getValueType() &&
-           "constant type mismatch");
-
-    // Replace.
-    GV->replaceAllUsesWith(GEP);
-    // Erase GV.
-    GV->removeDeadConstantUsers();
-    GV->eraseFromParent();
-  }
   return CBGV;
 }

@@ -144,6 +128,7 @@ void CGHLSLRuntime::addConstant(VarDecl *D, Buffer &CB) {
   }

   auto *GV = cast<llvm::GlobalVariable>(CGM.GetAddrOfGlobalVar(D));
+  GV->setExternallyInitialized(true);
   // Add debug info for constVal.
   if (CGDebugInfo *DI = CGM.getModuleDebugInfo())
     if (CGM.getCodeGenOpts().getDebugInfo() >=
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index f26469e6a2f1d..a7033cb54886a 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -25,6 +25,7 @@
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/Specifiers.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Sema/Initialization.h"
 #include "clang/Sema/ParsedAttr.h"
@@ -475,14 +476,20 @@ void createHostLayoutStructForBuffer(Sema &S, HLSLBufferDecl *BufDecl) {
   LS->setImplicit(true);
   LS->startDefinition();

-  for (const Decl *D : BufDecl->decls()) {
-    const VarDecl *VD = dyn_cast<VarDecl>(D);
+  for (Decl *D : BufDecl->decls()) {
+    VarDecl *VD = dyn_cast<VarDecl>(D);
     if (!VD || VD->getStorageClass() == SC_Static)
       continue;
     const Type *Ty = VD->getType()->getUnqualifiedDesugaredType();
     if (FieldDecl *FD =
-            createFieldForHostLayoutStruct(S, Ty, VD->getIdentifier(), LS))
+            createFieldForHostLayoutStruct(S, Ty, VD->getIdentifier(), LS)) {
+      // add the field decl to the layout struct
       LS->addDecl(FD);
+      // update address space of the original decl to hlsl_constant
+      QualType NewTy =
+          AST.getAddrSpaceQualType(VD->getType(), LangAS::hlsl_constant);
+      VD->setType(NewTy);
+    }
   }
   LS->completeDefinition();
   BufDecl->addDecl(LS);
diff --git a/clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl b/clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl
deleted file mode 100644
index 0bff3ae144037..0000000000000
--- a/clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl
+++ /dev/null
@@ -1,62 +0,0 @@
-// RUN: %clang_cc1 -Wdocumentation -ast-dump=json -x hlsl -triple dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=JSON
-// RUN: %clang_cc1 -Wdocumentation -ast-dump -x hlsl -triple dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=AST
-
-// JSON:"kind": "HLSLBufferDecl",
-// JSON:"name": "A",
-// JSON-NEXT:"bufferKind": "cbuffer",
-// JSON:"kind": "TextComment",
-// JSON:"text": " CBuffer decl."
-
-/// CBuffer decl.
-cbuffer A {
-    // JSON: "kind": "VarDecl",
-    // JSON: "name": "a",
-    // JSON: "qualType": "float"
-    float a;
-    // JSON: "kind": "VarDecl",
-    // JSON: "name": "b",
-    // JSON: "qualType": "int"
-    int b;
-}
-
-// JSON:"kind": "HLSLBufferDecl",
-// JSON:"name": "B",
-// JSON-NEXT:"bufferKind": "tbuffer",
-// JSON:"kind": "TextComment",
-// JSON:"text": " TBuffer decl."
-
-/// TBuffer decl.
-tbuffer B { - // JSON: "kind": "VarDecl", - // JSON: "name": "c", - // JSON: "qualType": "float" - float c; - // JSON: "kind": "VarDecl", - // JSON: "name": "d", - // JSON: "qualType": "int" - int d; -} - -// AST: HLSLBufferDecl {{.*}} line:11:9 cbuffer A -// AST-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer -// AST-NEXT: HLSLResourceAttr {{.*}} Implicit CBuffer -// AST-NEXT: FullComment -// AST-NEXT: ParagraphComment -// AST-NEXT: TextComment {{.*}} Text=" CBuffer decl." -// AST-NEXT: VarDecl {{.*}} a 'float' -// AST-NEXT: VarDecl {{.*}} b 'int' -// AST-NEXT: CXXRecordDecl {{.*}} implicit class __layout_A definition -// AST: FieldDecl {{.*}} a 'float' -// AST-NEXT: FieldDecl {{.*}} b 'int' - -// AST-NEXT: HLSLBufferDecl {{.*}} line:29:9 tbuffer B -// AST-NEXT: HLSLResourceClassAttr {{.*}} Implicit SRV -// AST-NEXT: HLSLResourceAttr {{.*}} Implicit TBuffer -// AST-NEXT: FullComment -// AST-NEXT: ParagraphComment -// AST-NEXT: TextComment {{.*}} Text=" TBuffer decl." -// AST-NEXT: VarDecl {{.*}} c 'float' -// AST-NEXT: VarDecl {{.*}} d 'int' -// AST-NEXT: CXXRecordDecl {{.*}} implicit class __layout_B definition -// AST: FieldDecl {{.*}} c 'float' -// AST-NEXT: FieldDecl {{.*}} d 'int' diff --git a/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl b/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl new file mode 100644 index 0000000000000..b2b3e13308da3 --- /dev/null +++ b/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl @@ -0,0 +1,32 @@ +// RUN: %clang_cc1 -Wdocumentation -ast-dump=json -x hlsl -triple dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=JSON +// RUN: %clang_cc1 -Wdocumentation -ast-dump -x hlsl -triple dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=AST + +// JSON:"kind": "HLSLBufferDecl", +// JSON:"name": "A", +// JSON-NEXT:"bufferKind": "cbuffer", +// JSON:"kind": "TextComment", +// JSON:"text": " CBuffer decl." + +/// CBuffer decl. +cbuffer A { + // JSON: "kind": "VarDecl", + // JSON: "name": "a", + // JSON: "qualType": "hlsl_constant float" + float a; + // JSON: "kind": "VarDecl", + // JSON: "name": "b", + // JSON: "qualType": "hlsl_constant int" + int b; +} + +// AST: HLSLBufferDecl {{.*}} line:11:9 cbuffer A +// AST-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// AST-NEXT: HLSLResourceAttr {{.*}} Implicit CBuffer +// AST-NEXT: FullComment +// AST-NEXT: ParagraphComment +// AST-NEXT: TextComment {{.*}} Text=" CBuffer decl." 
+// AST-NEXT: VarDecl {{.*}} a 'hlsl_constant float' +// AST-NEXT: VarDecl {{.*}} b 'hlsl_constant int' +// AST-NEXT: CXXRecordDecl {{.*}} implicit class __layout_A definition +// AST: FieldDecl {{.*}} a 'float' +// AST-NEXT: FieldDecl {{.*}} b 'int' diff --git a/clang/test/AST/HLSL/cbuffer.hlsl b/clang/test/AST/HLSL/cbuffer.hlsl index 721abb290f163..f516cf5099e82 100644 --- a/clang/test/AST/HLSL/cbuffer.hlsl +++ b/clang/test/AST/HLSL/cbuffer.hlsl @@ -48,7 +48,7 @@ struct TwoFloats { // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB { - // CHECK: VarDecl {{.*}} col:9 used a1 'float' + // CHECK: VarDecl {{.*}} col:9 used a1 'hlsl_constant float' float a1; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB definition // CHECK: FieldDecl {{.*}} a1 'float' @@ -60,7 +60,7 @@ _Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB { - // CHECK: VarDecl {{.*}} col:9 used a2 'float' + // CHECK: VarDecl {{.*}} col:9 used a2 'hlsl_constant float' float a2; // CHECK: VarDecl {{.*}} col:19 b2 'RWBuffer':'hlsl::RWBuffer' RWBuffer b2; @@ -68,7 +68,7 @@ cbuffer CB { EmptyStruct c2; // CHECK: VarDecl {{.*}} col:9 d2 'float[0]' float d2[0]; - // CHECK: VarDecl {{.*}} col:9 e2 'float' + // CHECK: VarDecl {{.*}} col:9 e2 'hlsl_constant float' float e2; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_1 definition // CHECK: FieldDecl {{.*}} a2 'float' @@ -81,11 +81,11 @@ _Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layou // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB { - // CHECK: VarDecl {{.*}} col:5 used s1 'A' + // CHECK: VarDecl {{.*}} col:5 used s1 'hlsl_constant A' A s1; - // CHECK: VarDecl {{.*}} col:5 s2 'B' + // CHECK: VarDecl {{.*}} col:5 s2 'hlsl_constant B' B s2; - // CHECK: VarDecl {{.*}} col:12 s3 'CTypedef':'C + // CHECK: VarDecl {{.*}} col:12 s3 'CTypedef':'C' CTypedef s3; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_2 definition // CHECK: FieldDecl {{.*}} s1 'A' @@ -102,7 +102,7 @@ _Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layou // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB { - // CHECK: VarDecl {{.*}} s4 'D' + // CHECK: VarDecl {{.*}} s4 'hlsl_constant D' D s4; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_3 definition // CHECK: FieldDecl {{.*}} s4 '__layout_D' @@ -120,9 +120,9 @@ _Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layou // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB { - // CHECK: VarDecl {{.*}} s5 'E' + // CHECK: VarDecl {{.*}} s5 'hlsl_constant E' E s5; - // CHECK: VarDecl {{.*}} s6 'BTypedef':'B' + // CHECK: VarDecl {{.*}} s6 'hlsl_constant BTypedef':'hlsl_constant B' BTypedef s6; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_4 definition // CHECK: FieldDecl {{.*}} s5 '__layout_E' @@ -158,7 +158,7 @@ cbuffer CB { // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB { - // CHECK: VarDecl {{.*}} s8 'F' + // CHECK: VarDecl {{.*}} s8 'hlsl_constant F' F s8; // CHECK: CXXRecordDecl {{.*}} implicit 
referenced class __layout_CB_6 definition // CHECK: FieldDecl {{.*}} s8 '__layout_F' @@ -182,7 +182,7 @@ cbuffer CB { // CHECK: FieldDecl {{.*}} f 'RWBuffer':'hlsl::RWBuffer' RWBuffer f; } s9; - // CHECK: VarDecl {{.*}} s9 'struct (unnamed struct at {{.*}}cbuffer.hlsl:177:3 + // CHECK: VarDecl {{.*}} s9 'hlsl_constant struct (unnamed struct at {{.*}}cbuffer.hlsl:177:3 // CHECK: CXXRecordDecl {{.*}} struct definition struct { // CHECK: FieldDecl {{.*}} g 'float' @@ -190,7 +190,7 @@ cbuffer CB { // CHECK: FieldDecl {{.*}} f 'RWBuffer':'hlsl::RWBuffer' RWBuffer f; } s10; - // CHECK: VarDecl {{.*}} s10 'struct (unnamed struct at {{.*}}cbuffer.hlsl:187:3 + // CHECK: VarDecl {{.*}} s10 'hlsl_constant struct (unnamed struct at {{.*}}cbuffer.hlsl:187:3 // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_anon definition // CHECK: FieldDecl {{.*}} e 'float' // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_anon_1 definition diff --git a/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl b/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl index 4b1bbea736f85..12ce327d8be02 100644 --- a/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl +++ b/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl @@ -35,11 +35,11 @@ struct Foo { // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB1 { - // CHECK: VarDecl {{.*}} foo1 'Foo' + // CHECK: VarDecl {{.*}} foo1 'hlsl_constant Foo' Foo foo1; - // CHECK: VarDecl {{.*}} foo2 'NS1::Foo' + // CHECK: VarDecl {{.*}} foo2 'hlsl_constant NS1::Foo' NS1::Foo foo2; - // CHECK: VarDecl {{.*}} foo3 'NS1::Bar::Foo' + // CHECK: VarDecl {{.*}} foo3 'hlsl_constant NS1::Bar::Foo' NS1::Bar::Foo foo3; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB1 definition // CHECK: FieldDecl {{.*}} foo1 '__layout_Foo' @@ -65,13 +65,13 @@ namespace NS2 { // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB2 { - // CHECK: VarDecl {{.*}} foo0 '::Foo':'Foo' + // CHECK: VarDecl {{.*}} foo0 'hlsl_constant ::Foo':'hlsl_constant Foo' ::Foo foo0; - // CHECK: VarDecl {{.*}} foo1 'Foo':'NS2::Foo' + // CHECK: VarDecl {{.*}} foo1 'hlsl_constant Foo':'hlsl_constant NS2::Foo' Foo foo1; - // CHECK: VarDecl {{.*}} foo2 'NS1::Foo' + // CHECK: VarDecl {{.*}} foo2 'hlsl_constant NS1::Foo' NS1::Foo foo2; - // CHECK: VarDecl {{.*}} foo3 'NS1::Bar::Foo' + // CHECK: VarDecl {{.*}} foo3 'hlsl_constant NS1::Bar::Foo' NS1::Bar::Foo foo3; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB2 definition // CHECK: FieldDecl {{.*}} foo0 '__layout_Foo' diff --git a/clang/test/AST/HLSL/packoffset.hlsl b/clang/test/AST/HLSL/packoffset.hlsl index 9c928bd6d922e..a9bb90bb386f9 100644 --- a/clang/test/AST/HLSL/packoffset.hlsl +++ b/clang/test/AST/HLSL/packoffset.hlsl @@ -6,13 +6,13 @@ cbuffer A { // CHECK-NEXT:-HLSLResourceClassAttr {{.*}} <> Implicit CBuffer // CHECK-NEXT:-HLSLResourceAttr {{.*}} <> Implicit CBuffer - // CHECK-NEXT: VarDecl {{.*}} A1 'float4' + // CHECK-NEXT: VarDecl {{.*}} A1 'hlsl_constant float4' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 0 float4 A1 : packoffset(c); - // CHECK-NEXT: VarDecl {{.*}} col:11 A2 'float' + // CHECK-NEXT: VarDecl {{.*}} col:11 A2 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 1 0 float A2 : packoffset(c1); - // CHECK-NEXT: VarDecl {{.*}} col:11 A3 'float' + // CHECK-NEXT: VarDecl {{.*}} col:11 A3 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 1 1 float A3 : 
packoffset(c1.y); } @@ -20,13 +20,13 @@ cbuffer A // CHECK: HLSLBufferDecl {{.*}} cbuffer B cbuffer B { - // CHECK: VarDecl {{.*}} B0 'float' + // CHECK: VarDecl {{.*}} B0 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 1 float B0 : packoffset(c0.g); - // CHECK-NEXT: VarDecl {{.*}} B1 'double' + // CHECK-NEXT: VarDecl {{.*}} B1 'hlsl_constant double' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 2 double B1 : packoffset(c0.b); - // CHECK-NEXT: VarDecl {{.*}} B2 'half' + // CHECK-NEXT: VarDecl {{.*}} B2 'hlsl_constant half' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 0 half B2 : packoffset(c0.r); } @@ -34,13 +34,13 @@ cbuffer B // CHECK: HLSLBufferDecl {{.*}} cbuffer C cbuffer C { - // CHECK: VarDecl {{.*}} C0 'float' + // CHECK: VarDecl {{.*}} C0 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 1 float C0 : packoffset(c0.y); - // CHECK-NEXT: VarDecl {{.*}} C1 'float2' + // CHECK-NEXT: VarDecl {{.*}} C1 'hlsl_constant float2' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 2 float2 C1 : packoffset(c0.z); - // CHECK-NEXT: VarDecl {{.*}} C2 'half' + // CHECK-NEXT: VarDecl {{.*}} C2 'hlsl_constant half' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 half C2 : packoffset(c0.x); } @@ -49,16 +49,16 @@ cbuffer C // CHECK: HLSLBufferDecl {{.*}} cbuffer D cbuffer D { - // CHECK: VarDecl {{.*}} D0 'float' + // CHECK: VarDecl {{.*}} D0 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 1 float D0 : packoffset(c0.y); - // CHECK-NEXT: VarDecl {{.*}} D1 'float[2]' + // CHECK-NEXT: VarDecl {{.*}} D1 'hlsl_constant float[2]' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 1 0 float D1[2] : packoffset(c1.x); - // CHECK-NEXT: VarDecl {{.*}} D2 'half3' + // CHECK-NEXT: VarDecl {{.*}} D2 'hlsl_constant half3' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 2 1 half3 D2 : packoffset(c2.y); - // CHECK-NEXT: VarDecl {{.*}} D3 'double' + // CHECK-NEXT: VarDecl {{.*}} D3 'hlsl_constant double' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 2 double D3 : packoffset(c0.z); } @@ -71,13 +71,13 @@ struct ST { // CHECK: HLSLBufferDecl {{.*}} cbuffer S cbuffer S { - // CHECK: VarDecl {{.*}} S0 'float' + // CHECK: VarDecl {{.*}} S0 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 1 float S0 : packoffset(c0.y); - // CHECK: VarDecl {{.*}} S1 'ST' + // CHECK: VarDecl {{.*}} S1 'hlsl_constant ST' // CHECK: HLSLPackOffsetAttr {{.*}} 1 0 ST S1 : packoffset(c1); - // CHECK: VarDecl {{.*}} S2 'double2' + // CHECK: VarDecl {{.*}} S2 'hlsl_constant double2' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 2 0 double2 S2 : packoffset(c2); } @@ -90,13 +90,13 @@ struct ST2 { // CHECK: HLSLBufferDecl {{.*}} cbuffer S2 cbuffer S2 { - // CHECK: VarDecl {{.*}} S20 'float' + // CHECK: VarDecl {{.*}} S20 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 3 float S20 : packoffset(c0.a); - // CHECK: VarDecl {{.*}} S21 'ST2' + // CHECK: VarDecl {{.*}} S21 'hlsl_constant ST2' // CHECK: HLSLPackOffsetAttr {{.*}} 1 0 ST2 S21 : packoffset(c1); - // CHECK: VarDecl {{.*}} S22 'half' + // CHECK: VarDecl {{.*}} S22 'hlsl_constant half' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 3 1 half S22 : packoffset(c3.y); } diff --git a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl index 3eabbb1f8ae22..98d7aba397852 100644 --- a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl +++ b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl @@ -20,14 +20,14 @@ float foo() { // CHECK: HLSLBufferDecl {{.*}} line:7:9 imported cbuffer A // CHECK-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK-NEXT: 
HLSLResourceAttr {{.*}} Implicit CBuffer -// CHECK-NEXT: VarDecl 0x[[A:[0-9a-f]+]] {{.*}} imported used a 'float' +// CHECK-NEXT: VarDecl 0x[[A:[0-9a-f]+]] {{.*}} imported used a 'hlsl_constant float' // CHECK-NEXT: CXXRecordDecl {{.*}} imported implicit class __layout_A definition // CHECK: FieldDecl {{.*}} imported a 'float' // CHECK: HLSLBufferDecl {{.*}} line:11:9 imported tbuffer B // CHECK-NEXT: HLSLResourceClassAttr {{.*}} Implicit SRV // CHECK-NEXT: HLSLResourceAttr {{.*}} Implicit TBuffer -// CHECK-NEXT: VarDecl 0x[[B:[0-9a-f]+]] {{.*}} imported used b 'float' +// CHECK-NEXT: VarDecl 0x[[B:[0-9a-f]+]] {{.*}} imported used b 'hlsl_constant float' // CHECK-NEXT: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} imported implicit class __layout_B definition // CHECK: FieldDecl 0x{{[0-9a-f]+}} {{.*}} imported b 'float' @@ -36,6 +36,6 @@ float foo() { // CHECK-NEXT: ReturnStmt {{.*}} // CHECK-NEXT: BinaryOperator {{.*}} 'float' '+' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' -// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue Var 0x[[A]] 'a' 'float' +// CHECK-NEXT: DeclRefExpr {{.*}} 'hlsl_constant float' lvalue Var 0x[[A]] 'a' 'hlsl_constant float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' -// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue Var 0x[[B]] 'b' 'float' +// CHECK-NEXT: DeclRefExpr {{.*}} 'hlsl_constant float' lvalue Var 0x[[B]] 'b' 'hlsl_constant float' diff --git a/clang/test/AST/HLSL/resource_binding_attr.hlsl b/clang/test/AST/HLSL/resource_binding_attr.hlsl index 13957ad3c1fcc..6fac903f75e18 100644 --- a/clang/test/AST/HLSL/resource_binding_attr.hlsl +++ b/clang/test/AST/HLSL/resource_binding_attr.hlsl @@ -4,7 +4,7 @@ // CHECK-NEXT:HLSLResourceClassAttr 0x[[CB:[0-9a-f]+]] {{.*}} Implicit CBuffer // CHECK-NEXT:HLSLResourceAttr 0x[[CB:[0-9a-f]+]] {{.*}} Implicit CBuffer // CHECK-NEXT:HLSLResourceBindingAttr 0x{{[0-9a-f]+}} "b3" "space2" -// CHECK-NEXT:VarDecl 0x[[A:[0-9a-f]+]] {{.*}} col:9 used a 'float' +// CHECK-NEXT:VarDecl 0x[[A:[0-9a-f]+]] {{.*}} col:9 used a 'hlsl_constant float' cbuffer CB : register(b3, space2) { float a; } @@ -13,7 +13,7 @@ cbuffer CB : register(b3, space2) { // CHECK-NEXT:HLSLResourceClassAttr 0x[[CB:[0-9a-f]+]] {{.*}} Implicit SRV // CHECK-NEXT:HLSLResourceAttr 0x[[CB:[0-9a-f]+]] {{.*}} Implicit TBuffer // CHECK-NEXT:HLSLResourceBindingAttr 0x{{[0-9a-f]+}} "t2" "space1" -// CHECK-NEXT:VarDecl 0x[[B:[0-9a-f]+]] {{.*}} col:9 used b 'float' +// CHECK-NEXT:VarDecl 0x[[B:[0-9a-f]+]] {{.*}} col:9 used b 'hlsl_constant float' tbuffer TB : register(t2, space1) { float b; } @@ -21,9 +21,9 @@ tbuffer TB : register(t2, space1) { float foo() { // CHECK: BinaryOperator 0x{{[0-9a-f]+}} 'float' '+' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-f]+}} 'float' -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} 'float' lvalue Var 0x[[A]] 'a' 'float' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} 'hlsl_constant float' lvalue Var 0x[[A]] 'a' 'hlsl_constant float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-f]+}} 'float' -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} 'float' lvalue Var 0x[[B]] 'b' 'float' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} 'hlsl_constant float' lvalue Var 0x[[B]] 'b' 'hlsl_constant float' return a + b; } diff --git a/clang/test/CodeGenHLSL/cbuf.hlsl b/clang/test/CodeGenHLSL/cbuf.hlsl index 3f9d4514967dd..825e7b8161a60 100644 --- a/clang/test/CodeGenHLSL/cbuf.hlsl +++ b/clang/test/CodeGenHLSL/cbuf.hlsl @@ -1,7 +1,14 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: %clang_cc1 -finclude-default-header 
-triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s + +// CHECK: @a = external addrspace(2) externally_initialized global float, align 4 +// CHECK: @b = external addrspace(2) externally_initialized global double, align 8 +// CHECK: @c = external addrspace(2) externally_initialized global float, align 4 +// CHECK: @d = external addrspace(2) externally_initialized global double, align 8 + // CHECK: @[[CB:.+]] = external constant { float, double } cbuffer A : register(b0, space2) { float a; @@ -15,10 +22,10 @@ tbuffer A : register(t2, space1) { } float foo() { -// CHECK: load float, ptr @[[CB]], align 4 -// CHECK: load double, ptr getelementptr ({ float, double }, ptr @[[CB]], i32 0, i32 1), align 8 -// CHECK: load float, ptr @[[TB]], align 4 -// CHECK: load double, ptr getelementptr ({ float, double }, ptr @[[TB]], i32 0, i32 1), align 8 +// CHECK: load float, ptr addrspace(2) @a, align 4 +// CHECK: load double, ptr addrspace(2) @b, align 8 +// CHECK: load float, ptr addrspace(2) @c, align 4 +// CHECK: load double, ptr addrspace(2) @d, align 8 return a + b + c*d; } diff --git a/clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl b/clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl index 73dc376942dfb..13c401d428331 100644 --- a/clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl +++ b/clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl @@ -1,8 +1,14 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s + +// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // Make sure cbuffer inside namespace works. 
+
+// CHECK: @_ZN2n02n11aE = external addrspace(2) externally_initialized global float, align 4
+// CHECK: @_ZN2n01bE = external addrspace(2) externally_initialized global float, align 4
+
 // CHECK: @[[CB:.+]] = external constant { float }
 // CHECK: @[[TB:.+]] = external constant { float }
 namespace n0 {
@@ -17,7 +23,7 @@ namespace n1 {
 }

 float foo() {
-// CHECK: load float, ptr @[[CB]], align 4
-// CHECK: load float, ptr @[[TB]], align 4
+// CHECK: load float, ptr addrspace(2) @_ZN2n02n11aE, align 4
+// CHECK: load float, ptr addrspace(2) @_ZN2n01bE, align 4
   return n0::n1::a + n0::b;
 }
diff --git a/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl b/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl
index f85bab2113170..25f51cce2017d 100644
--- a/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl
+++ b/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl
@@ -1,16 +1,21 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s

-// CHECK-DAG: @[[CB:.+]] = external constant { float }
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan-library %s \
+// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s

 cbuffer A {
-  float a;
-  // CHECK-DAG:@_ZL1b = internal global float 3.000000e+00, align 4
+  // CHECK: @a = external addrspace(2) externally_initialized global float, align 4
+  float a;
+  // CHECK: @_ZL1b = internal global float 3.000000e+00, align 4
   static float b = 3;
-  // CHECK:load float, ptr @[[CB]], align 4
-  // CHECK:load float, ptr @_ZL1b, align 4
   float foo() { return a + b; }
 }

+// CHECK: @[[CB:.+]] = external constant { float }
+
+// CHECK:define {{.*}} float @_Z3foov()
+// CHECK:load float, ptr addrspace(2) @a, align 4
+// CHECK:load float, ptr @_ZL1b, align 4
+
 float bar() {
   return foo();
diff --git a/clang/test/SemaTemplate/address_space-dependent.cpp b/clang/test/SemaTemplate/address_space-dependent.cpp
index 2ca9b8007ab41..eb8dbc69a945e 100644
--- a/clang/test/SemaTemplate/address_space-dependent.cpp
+++ b/clang/test/SemaTemplate/address_space-dependent.cpp
@@ -43,7 +43,7 @@ void neg() {

 template
 void tooBig() {
-  __attribute__((address_space(I))) int *bounds; // expected-error {{address space is larger than the maximum supported (8388586)}}
+  __attribute__((address_space(I))) int *bounds; // expected-error {{address space is larger than the maximum supported (8388585)}}
 }

 template

From 07ed8187acc31ac3f4779da452864a29d48799ac Mon Sep 17 00:00:00 2001
From: Alex MacLean
Date: Fri, 24 Jan 2025 16:56:10 -0800
Subject: [PATCH 071/432] [OpenMP] Replace nvvm.annotation usage with kernel
 calling conventions (#122320)

Specifying a kernel with the `ptx_kernel` or `amdgpu_kernel` calling
convention is more idiomatic and compile-time performant than using the
`nvvm.annotation !"kernel"` metadata. Transition OMPIRBuilder to use
calling conventions for PTX kernels and no longer emit `nvvm.annotation`.
Update OpenMPOpt to work with kernels specified via calling convention as
well as metadata. Update OpenMP tests to use the calling conventions.
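As a rough sketch of the IR difference (the function below is a made-up
minimal example; the actual changes are in the test updates of this
patch), a device kernel goes from being tagged via module metadata:

  define weak void @example_kernel() "kernel" {
    ret void
  }
  !nvvm.annotations = !{!0}
  !0 = !{ptr @example_kernel, !"kernel", i32 1}

to carrying the calling convention directly, with no metadata needed:

  define weak ptx_kernel void @example_kernel() "kernel" {
    ret void
  }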
--- clang/test/OpenMP/assumes_include_nvptx.cpp | 4 +- .../nvptx_target_firstprivate_codegen.cpp | 2 +- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 16 +- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 61 +- .../Transforms/OpenMP/always_inline_device.ll | 17 +- .../attributor_module_slice_reproducer.ll | 6 +- .../test/Transforms/OpenMP/barrier_removal.ll | 146 +- llvm/test/Transforms/OpenMP/bug66687.ll | 11 +- .../OpenMP/custom_state_machines.ll | 81 +- .../OpenMP/custom_state_machines_pre_lto.ll | 109 +- .../OpenMP/custom_state_machines_remarks.ll | 7 +- .../Transforms/OpenMP/deduplication_target.ll | 4 +- .../get_hardware_num_threads_in_block_fold.ll | 13 +- ...dware_num_threads_in_block_fold_optnone.ll | 13 +- .../Transforms/OpenMP/global_constructor.ll | 13 +- .../OpenMP/globalization_remarks.ll | 4 +- .../OpenMP/gpu_kernel_detection_remarks.ll | 9 +- ..._state_machine_function_ptr_replacement.ll | 4 +- .../OpenMP/is_spmd_exec_mode_fold.ll | 17 +- .../Transforms/OpenMP/nested_parallelism.ll | 13 +- .../Transforms/OpenMP/parallel_level_fold.ll | 13 +- .../Transforms/OpenMP/remove_globalization.ll | 26 +- .../OpenMP/replace_globalization.ll | 21 +- .../OpenMP/single_threaded_execution.ll | 4 +- llvm/test/Transforms/OpenMP/spmdization.ll | 1607 +++-------------- .../Transforms/OpenMP/spmdization_assumes.ll | 27 +- .../OpenMP/spmdization_constant_prop.ll | 2 - .../Transforms/OpenMP/spmdization_guarding.ll | 106 +- ...mdization_guarding_two_reaching_kernels.ll | 36 +- .../Transforms/OpenMP/spmdization_indirect.ll | 161 +- ...zation_no_guarding_two_reaching_kernels.ll | 35 +- .../Transforms/OpenMP/spmdization_remarks.ll | 7 +- .../OpenMP/value-simplify-openmp-opt.ll | 96 +- .../Frontend/OpenMPIRBuilderTest.cpp | 20 - 34 files changed, 606 insertions(+), 2105 deletions(-) diff --git a/clang/test/OpenMP/assumes_include_nvptx.cpp b/clang/test/OpenMP/assumes_include_nvptx.cpp index 4577ea4c9c2b5..c5040989a0e40 100644 --- a/clang/test/OpenMP/assumes_include_nvptx.cpp +++ b/clang/test/OpenMP/assumes_include_nvptx.cpp @@ -11,11 +11,11 @@ // TODO: Think about teaching the OMPIRBuilder about default attributes as well so the __kmpc* declarations are annotated. 
-// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}({{.*}}) [[attr0:#[0-9]]] +// CHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}({{.*}}) [[attr0:#[0-9]]] // CHECK: call i32 @__kmpc_target_init( // CHECK: declare noundef float @_Z3sinf(float noundef) [[attr1:#[0-9]*]] // CHECK: declare void @__kmpc_target_deinit( -// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}__Z17complex_reductionIdEvv_{{.*}}({{.*}}) [[attr0]] +// CHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.*}}__Z17complex_reductionIdEvv_{{.*}}({{.*}}) [[attr0]] // CHECK: %call = call noundef double @_Z3sind(double noundef 0.000000e+00) [[attr2:#[0-9]]] // CHECK: declare noundef double @_Z3sind(double noundef) [[attr1]] diff --git a/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp b/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp index d573f1cd193d6..94ace20826db4 100644 --- a/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp @@ -90,7 +90,7 @@ int foo(int n, double *ptr) { ptr[0]++; } - // TCHECK: define weak_odr protected void @__omp_offloading_{{.+}}(ptr {{[^,]+}}, ptr noundef [[PTR_IN:%.+]]) + // TCHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.+}}(ptr {{[^,]+}}, ptr noundef [[PTR_IN:%.+]]) // TCHECK: [[DYN_PTR_ADDR:%.+]] = alloca ptr, // TCHECK: [[PTR_ADDR:%.+]] = alloca ptr, // TCHECK-NOT: alloca ptr, diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 777391327f77c..8cc3a99d92023 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6468,6 +6468,8 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes( OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility); if (T.isAMDGCN()) OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL); + else if (T.isNVPTX()) + OutlinedFn->setCallingConv(CallingConv::PTX_Kernel); } } @@ -9223,20 +9225,8 @@ void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr, if (!Fn) return; - Module &M = *(Fn->getParent()); - LLVMContext &Ctx = M.getContext(); - - // Get "nvvm.annotations" metadata node. - NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); - - Metadata *MDVals[] = { - ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"), - ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))}; - // Append metadata to nvvm.annotations. - MD->addOperand(MDNode::get(Ctx, MDVals)); - // Add a function attribute for the kernel. 
-  Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
+  Fn->addFnAttr("kernel");
   if (T.isAMDGCN())
     Fn->addFnAttr("uniform-work-group-size", "true");
   Fn->addFnAttr(Attribute::MustProgress);
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index e7221ee406a18..10008130016c3 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -19,6 +19,7 @@

 #include "llvm/Transforms/IPO/OpenMPOpt.h"

+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/EnumeratedArray.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
@@ -36,6 +37,7 @@
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/IR/Assumptions.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
@@ -5903,34 +5905,51 @@ bool llvm::omp::isOpenMPKernel(Function &Fn) {
   return Fn.hasFnAttribute("kernel");
 }

+static bool isKernelCC(Function &F) {
+  switch (F.getCallingConv()) {
+  default:
+    return false;
+  case CallingConv::PTX_Kernel:
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return true;
+  }
+}
+
 KernelSet llvm::omp::getDeviceKernels(Module &M) {
   // TODO: Create a more cross-platform way of determining device kernels.
-  NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
   KernelSet Kernels;

-  if (!MD)
-    return Kernels;
+  DenseSet<Function *> SeenKernels;
+  auto ProcessKernel = [&](Function &KF) {
+    if (SeenKernels.insert(&KF).second) {
+      // We are only interested in OpenMP target regions. Others, such as
+      // kernels generated by CUDA but linked together, are not interesting to
+      // this pass.
+      if (isOpenMPKernel(KF)) {
+        ++NumOpenMPTargetRegionKernels;
+        Kernels.insert(&KF);
+      } else
+        ++NumNonOpenMPTargetRegionKernels;
+    }
+  };

-  for (auto *Op : MD->operands()) {
-    if (Op->getNumOperands() < 2)
-      continue;
-    MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
-    if (!KindID || KindID->getString() != "kernel")
-      continue;
+  if (NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"))
+    for (auto *Op : MD->operands()) {
+      if (Op->getNumOperands() < 2)
+        continue;
+      MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
+      if (!KindID || KindID->getString() != "kernel")
+        continue;

-    Function *KernelFn =
-        mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
-    if (!KernelFn)
-      continue;
+      if (auto *KernelFn =
+              mdconst::dyn_extract_or_null<Function>(Op->getOperand(0)))
+        ProcessKernel(*KernelFn);
+    }

-    // We are only interested in OpenMP target regions. Others, such as kernels
-    // generated by CUDA but linked together, are not interesting to this pass.
-    if (isOpenMPKernel(*KernelFn)) {
-      ++NumOpenMPTargetRegionKernels;
-      Kernels.insert(KernelFn);
-    } else
-      ++NumNonOpenMPTargetRegionKernels;
-  }
+  for (Function &F : M)
+    if (isKernelCC(F))
+      ProcessKernel(F);

   return Kernels;
 }
diff --git a/llvm/test/Transforms/OpenMP/always_inline_device.ll b/llvm/test/Transforms/OpenMP/always_inline_device.ll
index 6028ff5278037..9c5b19f7a6c88 100644
--- a/llvm/test/Transforms/OpenMP/always_inline_device.ll
+++ b/llvm/test/Transforms/OpenMP/always_inline_device.ll
@@ -17,7 +17,7 @@
 ; CHECK: @G = external global i8
 ; CHECK: @kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
-define weak void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 { ; CHECK: Function Attrs: norecurse nounwind ; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4( ; CHECK-NEXT: entry: @@ -79,12 +79,10 @@ attributes #1 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-wi attributes #2 = { convergent } !omp_offload.info = !{!0} -!nvvm.annotations = !{!1} !llvm.module.flags = !{!2, !3, !4, !5, !6} !llvm.ident = !{!7} !0 = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0} -!1 = !{ptr @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1} !2 = !{i32 1, !"wchar_size", i32 4} !3 = !{i32 7, !"openmp", i32 50} !4 = !{i32 7, !"openmp-device", i32 50} @@ -97,11 +95,10 @@ attributes #2 = { convergent } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0} -; CHECK: [[META1:![0-9]+]] = !{ptr @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1} -; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} ;. diff --git a/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll b/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll index 9c0416af359d4..3f4790ee15ac8 100644 --- a/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll +++ b/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll @@ -13,10 +13,6 @@ define linkonce_odr hidden i8 @_ZStplIdESt7complexIT_ERKS2_S4_() local_unnamed_a ret i8 undef } -declare void @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148(i64, i64, i64, ptr, ptr, i64, ptr, ptr, ptr, i64) local_unnamed_addr +declare ptx_kernel void @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148(i64, i64, i64, ptr, ptr, i64, ptr, ptr, ptr, i64) local_unnamed_addr declare dso_local fastcc void @__kmpc_for_static_init_8u() unnamed_addr - -!nvvm.annotations = !{!0} - -!0 = !{ptr @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148, !"kernel", i32 1} diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll index 47a5d5104aa8b..5b7544b1a7961 100644 --- a/llvm/test/Transforms/OpenMP/barrier_removal.ll +++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll @@ -28,7 +28,7 @@ declare void @llvm.assume(i1) ; CHECK: @G1 = global i32 42 ; CHECK: @G2 = addrspace(1) global i32 0 ;. 
-define void @pos_empty_1(i1 %c) "kernel" { +define amdgpu_kernel void @pos_empty_1(i1 %c) "kernel" { ; MODULE-LABEL: define {{[^@]+}}@pos_empty_1 ; MODULE-SAME: (i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] { ; MODULE-NEXT: ret void @@ -45,7 +45,7 @@ define void @pos_empty_1(i1 %c) "kernel" { call void @llvm.assume(i1 %c) ret void } -define void @pos_empty_2() "kernel" { +define amdgpu_kernel void @pos_empty_2() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_2 ; CHECK-SAME: () #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: ret void @@ -53,7 +53,7 @@ define void @pos_empty_2() "kernel" { call void @aligned_barrier() ret void } -define void @pos_empty_3() "kernel" { +define amdgpu_kernel void @pos_empty_3() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_3 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: ret void @@ -61,7 +61,7 @@ define void @pos_empty_3() "kernel" { call void @llvm.nvvm.barrier0() ret void } -define void @pos_empty_4() "kernel" { +define amdgpu_kernel void @pos_empty_4() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_4 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: ret void @@ -69,7 +69,7 @@ define void @pos_empty_4() "kernel" { call i32 @llvm.nvvm.barrier0.and(i32 0) ret void } -define void @pos_empty_5() "kernel" { +define amdgpu_kernel void @pos_empty_5() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_5 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: ret void @@ -77,7 +77,7 @@ define void @pos_empty_5() "kernel" { call i32 @llvm.nvvm.barrier0.or(i32 0) ret void } -define void @pos_empty_6() "kernel" { +define amdgpu_kernel void @pos_empty_6() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_6 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: ret void @@ -85,7 +85,7 @@ define void @pos_empty_6() "kernel" { call i32 @llvm.nvvm.barrier0.popc(i32 0) ret void } -define void @pos_empty_7a() "kernel" { +define amdgpu_kernel void @pos_empty_7a() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_7a ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: call void @unknown() @@ -96,7 +96,7 @@ define void @pos_empty_7a() "kernel" { ret void } ; FIXME: We should remove the barrier. 
-define void @pos_empty_7b() "kernel" { +define amdgpu_kernel void @pos_empty_7b() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_7b ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: call void @unknown() #[[ATTR5:[0-9]+]] @@ -109,7 +109,7 @@ define void @pos_empty_7b() "kernel" { call void @unknown() ret void } -define void @pos_empty_8(i1 %c) "kernel" { +define amdgpu_kernel void @pos_empty_8(i1 %c) "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_8 ; CHECK-SAME: (i1 [[C:%.*]]) #[[ATTR4]] { ; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] @@ -126,7 +126,7 @@ t: f: ret void } -define void @neg_empty_8() "kernel" { +define amdgpu_kernel void @neg_empty_8() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@neg_empty_8 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: call void @unknown() @@ -137,7 +137,7 @@ define void @neg_empty_8() "kernel" { call void @llvm.amdgcn.s.barrier() ret void } -define void @neg_empty_9(i1 %c) "kernel" { +define amdgpu_kernel void @neg_empty_9(i1 %c) "kernel" { ; CHECK-LABEL: define {{[^@]+}}@neg_empty_9 ; CHECK-SAME: (i1 [[C:%.*]]) #[[ATTR4]] { ; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] @@ -173,7 +173,7 @@ m: ret void } ; FIXME: We should remove the barrier -define void @pos_empty_10() "kernel" { +define amdgpu_kernel void @pos_empty_10() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_10 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: br label [[M:%.*]] @@ -186,7 +186,7 @@ m: call void @llvm.amdgcn.s.barrier() ret void } -define void @pos_empty_11() "kernel" { +define amdgpu_kernel void @pos_empty_11() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_11 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: br label [[M:%.*]] @@ -206,7 +206,7 @@ define void @empty() { ret void } ; FIXME: We should remove the barrier in the end but not the first one. -define void @neg_empty_12(i1 %c) "kernel" { +define amdgpu_kernel void @neg_empty_12(i1 %c) "kernel" { ; MODULE-LABEL: define {{[^@]+}}@neg_empty_12 ; MODULE-SAME: (i1 [[C:%.*]]) #[[ATTR4]] { ; MODULE-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] @@ -266,7 +266,7 @@ define void @neg_empty_2() "kernel" { @GC1 = constant i32 42 @GC2 = addrspace(4) global i32 0 @GPtr4 = addrspace(4) global ptr addrspace(4) null -define void @pos_constant_loads() "kernel" { +define amdgpu_kernel void @pos_constant_loads() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_constant_loads ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: [[ARG:%.*]] = load ptr addrspace(4), ptr addrspace(4) @GPtr4, align 8 @@ -296,7 +296,7 @@ define void @pos_constant_loads() "kernel" { @GS = addrspace(3) global i32 0 @GPtr = global ptr null ; TODO: We could remove some of the barriers due to the lack of write effects. 
-define void @neg_loads() "kernel" { +define amdgpu_kernel void @neg_loads() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@neg_loads ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: [[ARG:%.*]] = load ptr, ptr @GPtr, align 8 @@ -327,7 +327,7 @@ define void @neg_loads() "kernel" { @PG1 = thread_local global i32 42 @PG2 = addrspace(5) global i32 0 @GPtr5 = global ptr addrspace(5) null -define void @pos_priv_mem() "kernel" { +define amdgpu_kernel void @pos_priv_mem() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_priv_mem ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: [[ARG:%.*]] = load ptr addrspace(5), ptr @GPtr5, align 4 @@ -358,7 +358,7 @@ define void @pos_priv_mem() "kernel" { } @G1 = global i32 42 @G2 = addrspace(1) global i32 0 -define void @neg_mem() "kernel" { +define amdgpu_kernel void @neg_mem() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@neg_mem ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: [[ARG:%.*]] = load ptr, ptr @GPtr, align 8 @@ -388,7 +388,7 @@ define void @neg_mem() "kernel" { ret void } -define void @pos_multiple() "kernel" { +define amdgpu_kernel void @pos_multiple() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_multiple ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: ret void @@ -404,7 +404,7 @@ define void @pos_multiple() "kernel" { ret void } -define void @multiple_blocks_kernel_1(i1 %c0, i1 %c1) "kernel" { +define amdgpu_kernel void @multiple_blocks_kernel_1(i1 %c0, i1 %c1) "kernel" { ; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_kernel_1 ; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) #[[ATTR4]] { ; CHECK-NEXT: br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]] @@ -461,7 +461,7 @@ m: ret void } -define void @multiple_blocks_kernel_2(i1 %c0, i1 %c1, ptr %p) "kernel" { +define amdgpu_kernel void @multiple_blocks_kernel_2(i1 %c0, i1 %c1, ptr %p) "kernel" { ; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_kernel_2 ; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) #[[ATTR4]] { ; CHECK-NEXT: store i32 4, ptr [[P]], align 4 @@ -727,7 +727,7 @@ define internal void @barrier_then_write_then_barrier0(ptr %p) { call void @aligned_barrier() ret void } -define void @multiple_blocks_functions_kernel_effects_0(i1 %c0, i1 %c1, ptr %p) "kernel" { +define amdgpu_kernel void @multiple_blocks_functions_kernel_effects_0(i1 %c0, i1 %c1, ptr %p) "kernel" { ; MODULE-LABEL: define {{[^@]+}}@multiple_blocks_functions_kernel_effects_0 ; MODULE-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) #[[ATTR4]] { ; MODULE-NEXT: call void @barrier_then_write_then_barrier0(ptr [[P]]) @@ -1040,7 +1040,7 @@ define internal void @callee_barrier() { call void @aligned_barrier() ret void } -define void @caller_barrier1() "kernel" { +define amdgpu_kernel void @caller_barrier1() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@caller_barrier1 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: call void @callee_barrier() @@ -1051,7 +1051,7 @@ define void @caller_barrier1() "kernel" { call void @aligned_barrier() ret void } -define void @caller_barrier2() "kernel" { +define amdgpu_kernel void @caller_barrier2() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@caller_barrier2 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: call void @unknown() @@ -1065,7 +1065,7 @@ define void @caller_barrier2() "kernel" { ret void } -define void @loop_barrier() "kernel" { +define amdgpu_kernel void @loop_barrier() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@loop_barrier ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: @@ -1095,7 +1095,7 @@ exit: ret void } -define void @loop_barrier_end_barriers() "kernel" { +define amdgpu_kernel void 
@loop_barrier_end_barriers() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: @@ -1129,7 +1129,7 @@ exit: ret void } -define void @loop_barrier_end_barriers_unknown() "kernel" { +define amdgpu_kernel void @loop_barrier_end_barriers_unknown() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers_unknown ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: @@ -1165,7 +1165,7 @@ exit: ret void } -define void @loop_barrier_store() "kernel" { +define amdgpu_kernel void @loop_barrier_store() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@loop_barrier_store ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: @@ -1195,7 +1195,7 @@ exit: ret void } -define void @loop_barrier_end_barriers_store() "kernel" { +define amdgpu_kernel void @loop_barrier_end_barriers_store() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers_store ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: @@ -1232,37 +1232,7 @@ exit: } !llvm.module.flags = !{!16,!15} -!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14,!17,!18,!19,!20,!21,!22,!23,!24,!25,!26,!27,!28,!29,!30} -!0 = !{ptr @pos_empty_1, !"kernel", i32 1} -!1 = !{ptr @pos_empty_2, !"kernel", i32 1} -!2 = !{ptr @pos_empty_3, !"kernel", i32 1} -!3 = !{ptr @pos_empty_4, !"kernel", i32 1} -!4 = !{ptr @pos_empty_5, !"kernel", i32 1} -!5 = !{ptr @pos_empty_6, !"kernel", i32 1} -!17 = !{ptr @pos_empty_7a, !"kernel", i32 1} -!18 = !{ptr @pos_empty_7b, !"kernel", i32 1} -!23 = !{ptr @pos_empty_8, !"kernel", i32 1} -!24 = !{ptr @caller_barrier1, !"kernel", i32 1} -!25 = !{ptr @caller_barrier2, !"kernel", i32 1} -!26 = !{ptr @loop_barrier, !"kernel", i32 1} -!27 = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1} -!28 = !{ptr @loop_barrier_end_barriers_unknown, !"kernel", i32 1} -!29 = !{ptr @loop_barrier_store, !"kernel", i32 1} -!30 = !{ptr @loop_barrier_end_barriers_store, !"kernel", i32 1} -!6 = !{ptr @neg_empty_8, !"kernel", i32 1} -!19 = !{ptr @neg_empty_9, !"kernel", i32 1} -!20 = !{ptr @pos_empty_10, !"kernel", i32 1} -!21 = !{ptr @pos_empty_11, !"kernel", i32 1} -!22 = !{ptr @neg_empty_12, !"kernel", i32 1} -!7 = !{ptr @pos_constant_loads, !"kernel", i32 1} -!8 = !{ptr @neg_loads, !"kernel", i32 1} -!9 = !{ptr @pos_priv_mem, !"kernel", i32 1} -!10 = !{ptr @neg_mem, !"kernel", i32 1} -!11 = !{ptr @pos_multiple, !"kernel", i32 1} -!12 = !{ptr @multiple_blocks_kernel_1, !"kernel", i32 1} -!13 = !{ptr @multiple_blocks_kernel_2, !"kernel", i32 1} -!14 = !{ptr @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1} !15 = !{i32 7, !"openmp", i32 50} !16 = !{i32 7, !"openmp-device", i32 50} ;. @@ -1282,65 +1252,7 @@ exit: ;. 
; MODULE: [[META0:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ; MODULE: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; MODULE: [[META2:![0-9]+]] = !{ptr @pos_empty_1, !"kernel", i32 1} -; MODULE: [[META3:![0-9]+]] = !{ptr @pos_empty_2, !"kernel", i32 1} -; MODULE: [[META4:![0-9]+]] = !{ptr @pos_empty_3, !"kernel", i32 1} -; MODULE: [[META5:![0-9]+]] = !{ptr @pos_empty_4, !"kernel", i32 1} -; MODULE: [[META6:![0-9]+]] = !{ptr @pos_empty_5, !"kernel", i32 1} -; MODULE: [[META7:![0-9]+]] = !{ptr @pos_empty_6, !"kernel", i32 1} -; MODULE: [[META8:![0-9]+]] = !{ptr @neg_empty_8, !"kernel", i32 1} -; MODULE: [[META9:![0-9]+]] = !{ptr @pos_constant_loads, !"kernel", i32 1} -; MODULE: [[META10:![0-9]+]] = !{ptr @neg_loads, !"kernel", i32 1} -; MODULE: [[META11:![0-9]+]] = !{ptr @pos_priv_mem, !"kernel", i32 1} -; MODULE: [[META12:![0-9]+]] = !{ptr @neg_mem, !"kernel", i32 1} -; MODULE: [[META13:![0-9]+]] = !{ptr @pos_multiple, !"kernel", i32 1} -; MODULE: [[META14:![0-9]+]] = !{ptr @multiple_blocks_kernel_1, !"kernel", i32 1} -; MODULE: [[META15:![0-9]+]] = !{ptr @multiple_blocks_kernel_2, !"kernel", i32 1} -; MODULE: [[META16:![0-9]+]] = !{ptr @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1} -; MODULE: [[META17:![0-9]+]] = !{ptr @pos_empty_7a, !"kernel", i32 1} -; MODULE: [[META18:![0-9]+]] = !{ptr @pos_empty_7b, !"kernel", i32 1} -; MODULE: [[META19:![0-9]+]] = !{ptr @neg_empty_9, !"kernel", i32 1} -; MODULE: [[META20:![0-9]+]] = !{ptr @pos_empty_10, !"kernel", i32 1} -; MODULE: [[META21:![0-9]+]] = !{ptr @pos_empty_11, !"kernel", i32 1} -; MODULE: [[META22:![0-9]+]] = !{ptr @neg_empty_12, !"kernel", i32 1} -; MODULE: [[META23:![0-9]+]] = !{ptr @pos_empty_8, !"kernel", i32 1} -; MODULE: [[META24:![0-9]+]] = !{ptr @caller_barrier1, !"kernel", i32 1} -; MODULE: [[META25:![0-9]+]] = !{ptr @caller_barrier2, !"kernel", i32 1} -; MODULE: [[META26:![0-9]+]] = !{ptr @loop_barrier, !"kernel", i32 1} -; MODULE: [[META27:![0-9]+]] = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1} -; MODULE: [[META28:![0-9]+]] = !{ptr @loop_barrier_end_barriers_unknown, !"kernel", i32 1} -; MODULE: [[META29:![0-9]+]] = !{ptr @loop_barrier_store, !"kernel", i32 1} -; MODULE: [[META30:![0-9]+]] = !{ptr @loop_barrier_end_barriers_store, !"kernel", i32 1} ;. 
; CGSCC: [[META0:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ; CGSCC: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CGSCC: [[META2:![0-9]+]] = !{ptr @pos_empty_1, !"kernel", i32 1} -; CGSCC: [[META3:![0-9]+]] = !{ptr @pos_empty_2, !"kernel", i32 1} -; CGSCC: [[META4:![0-9]+]] = !{ptr @pos_empty_3, !"kernel", i32 1} -; CGSCC: [[META5:![0-9]+]] = !{ptr @pos_empty_4, !"kernel", i32 1} -; CGSCC: [[META6:![0-9]+]] = !{ptr @pos_empty_5, !"kernel", i32 1} -; CGSCC: [[META7:![0-9]+]] = !{ptr @pos_empty_6, !"kernel", i32 1} -; CGSCC: [[META8:![0-9]+]] = !{ptr @neg_empty_8, !"kernel", i32 1} -; CGSCC: [[META9:![0-9]+]] = !{ptr @pos_constant_loads, !"kernel", i32 1} -; CGSCC: [[META10:![0-9]+]] = !{ptr @neg_loads, !"kernel", i32 1} -; CGSCC: [[META11:![0-9]+]] = !{ptr @pos_priv_mem, !"kernel", i32 1} -; CGSCC: [[META12:![0-9]+]] = !{ptr @neg_mem, !"kernel", i32 1} -; CGSCC: [[META13:![0-9]+]] = !{ptr @pos_multiple, !"kernel", i32 1} -; CGSCC: [[META14:![0-9]+]] = !{ptr @multiple_blocks_kernel_1, !"kernel", i32 1} -; CGSCC: [[META15:![0-9]+]] = !{ptr @multiple_blocks_kernel_2, !"kernel", i32 1} -; CGSCC: [[META16:![0-9]+]] = !{ptr @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1} -; CGSCC: [[META17:![0-9]+]] = !{ptr @pos_empty_7a, !"kernel", i32 1} -; CGSCC: [[META18:![0-9]+]] = !{ptr @pos_empty_7b, !"kernel", i32 1} -; CGSCC: [[META19:![0-9]+]] = !{ptr @neg_empty_9, !"kernel", i32 1} -; CGSCC: [[META20:![0-9]+]] = !{ptr @pos_empty_10, !"kernel", i32 1} -; CGSCC: [[META21:![0-9]+]] = !{ptr @pos_empty_11, !"kernel", i32 1} -; CGSCC: [[META22:![0-9]+]] = !{ptr @neg_empty_12, !"kernel", i32 1} -; CGSCC: [[META23:![0-9]+]] = !{ptr @pos_empty_8, !"kernel", i32 1} -; CGSCC: [[META24:![0-9]+]] = !{ptr @caller_barrier1, !"kernel", i32 1} -; CGSCC: [[META25:![0-9]+]] = !{ptr @caller_barrier2, !"kernel", i32 1} -; CGSCC: [[META26:![0-9]+]] = !{ptr @loop_barrier, !"kernel", i32 1} -; CGSCC: [[META27:![0-9]+]] = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1} -; CGSCC: [[META28:![0-9]+]] = !{ptr @loop_barrier_end_barriers_unknown, !"kernel", i32 1} -; CGSCC: [[META29:![0-9]+]] = !{ptr @loop_barrier_store, !"kernel", i32 1} -; CGSCC: [[META30:![0-9]+]] = !{ptr @loop_barrier_end_barriers_store, !"kernel", i32 1} ;. 
diff --git a/llvm/test/Transforms/OpenMP/bug66687.ll b/llvm/test/Transforms/OpenMP/bug66687.ll index e0a9b825a8804..9bb069b1735be 100644 --- a/llvm/test/Transforms/OpenMP/bug66687.ll +++ b/llvm/test/Transforms/OpenMP/bug66687.ll @@ -5,25 +5,22 @@ source_filename = "bug66687.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -define weak void @openmp_kernel() "kernel" { -; CHECK-LABEL: define weak void @openmp_kernel( +define weak ptx_kernel void @openmp_kernel() "kernel" { +; CHECK-LABEL: define weak ptx_kernel void @openmp_kernel( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void } -define weak_odr void @non_openmp_kernel() { -; CHECK-LABEL: define weak_odr void @non_openmp_kernel() { +define weak_odr ptx_kernel void @non_openmp_kernel() { +; CHECK-LABEL: define weak_odr ptx_kernel void @non_openmp_kernel() { ; CHECK-NEXT: ret void ; ret void } !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3} !0 = !{i32 7, !"openmp", i32 51} !1 = !{i32 7, !"openmp-device", i32 51} -!2 = !{ptr @openmp_kernel, !"kernel", i32 1} -!3 = !{ptr @non_openmp_kernel, !"kernel", i32 1} diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll index e6ddf16f06763..10e521bbfcc10 100644 --- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll @@ -138,7 +138,7 @@ @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } -define weak void @__omp_offloading_14_a36502b_no_state_machine_needed_l14(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_no_state_machine_needed_l14(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -197,7 +197,7 @@ declare i32 @__kmpc_global_thread_num(ptr) #3 declare void @__kmpc_target_deinit() -define weak void @__omp_offloading_14_a36502b_simple_state_machine_l22(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_l22(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -290,7 +290,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -367,7 +367,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. 
= alloca i32, align 4 @@ -453,7 +453,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -537,7 +537,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_pure_l77(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_pure_l77(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -624,7 +624,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -679,7 +679,7 @@ return: ; preds = %if.end, %if.then declare i32 @omp_get_thread_num(...) #4 -define weak void @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -812,7 +812,6 @@ attributes #8 = { convergent "llvm.assume"="omp_no_openmp" } attributes #9 = { convergent nounwind readonly willreturn } !omp_offload.info = !{!0, !1, !2, !3, !4, !5, !6, !7} -!nvvm.annotations = !{!8, !9, !10, !11, !12, !13, !14, !15} !llvm.module.flags = !{!16, !17, !18} !0 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} @@ -823,14 +822,6 @@ attributes #9 = { convergent nounwind readonly willreturn } !5 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} !6 = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} !7 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -!8 = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -!9 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -!10 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -!11 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -!12 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -!13 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -!14 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -!15 = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} !16 = !{i32 1, !"wchar_size", i32 4} !17 = !{i32 7, !"openmp", i32 50} !18 = !{i32 7, !"openmp-device", i32 50} @@ -4107,17 +4098,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; AMDGPU: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; AMDGPU: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; AMDGPU: [[META8:![0-9]+]] = !{ptr 
@__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; AMDGPU: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; AMDGPU: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; AMDGPU: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; AMDGPU: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; AMDGPU: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; AMDGPU: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; AMDGPU: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; AMDGPU: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. ; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -4127,17 +4110,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; NVPTX: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; NVPTX: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; NVPTX: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; NVPTX: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; NVPTX: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; NVPTX: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; NVPTX: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; NVPTX: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; NVPTX: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; NVPTX: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; NVPTX: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; AMDGPU-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; AMDGPU-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -4147,17 +4122,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; AMDGPU-DISABLED: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; AMDGPU-DISABLED: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; AMDGPU-DISABLED: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU-DISABLED: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU-DISABLED: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU-DISABLED: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU-DISABLED: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU-DISABLED: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; NVPTX-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; NVPTX-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -4167,15 +4134,7 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; NVPTX-DISABLED: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; NVPTX-DISABLED: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; NVPTX-DISABLED: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; NVPTX-DISABLED: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; NVPTX-DISABLED: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; NVPTX-DISABLED: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; NVPTX-DISABLED: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; NVPTX-DISABLED: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; NVPTX-DISABLED: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; NVPTX-DISABLED: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; NVPTX-DISABLED: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX-DISABLED: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX-DISABLED: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX-DISABLED: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX-DISABLED: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX-DISABLED: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll index d20821d450365..9576ff6ca6aee 100644 --- a/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll @@ -139,7 +139,7 @@ @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } -define weak void @__omp_offloading_14_a36502b_no_state_machine_needed_l14(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_no_state_machine_needed_l14(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. 
= alloca i32, align 4 @@ -196,7 +196,7 @@ declare i32 @__kmpc_global_thread_num(ptr) #3 declare void @__kmpc_target_deinit() -define weak void @__omp_offloading_14_a36502b_simple_state_machine_l22(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_l22(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -289,7 +289,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -366,7 +366,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -452,7 +452,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -536,7 +536,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_pure_l77(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_pure_l77(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -623,7 +623,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -678,7 +678,7 @@ return: ; preds = %if.end, %if.then declare i32 @omp_get_thread_num(...) #4 -define weak void @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. 
= alloca i32, align 4 @@ -811,7 +811,6 @@ attributes #8 = { convergent "llvm.assume"="omp_no_openmp" } attributes #9 = { convergent nounwind readonly willreturn } !omp_offload.info = !{!0, !1, !2, !3, !4, !5, !6, !7} -!nvvm.annotations = !{!8, !9, !10, !11, !12, !13, !14, !15} !llvm.module.flags = !{!16, !17, !18} !0 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} @@ -822,14 +821,6 @@ attributes #9 = { convergent nounwind readonly willreturn } !5 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} !6 = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} !7 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -!8 = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -!9 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -!10 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -!11 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -!12 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -!13 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -!14 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -!15 = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} !16 = !{i32 1, !"wchar_size", i32 4} !17 = !{i32 7, !"openmp", i32 50} !18 = !{i32 7, !"openmp-device", i32 50} @@ -4976,17 +4967,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; AMDGPU1: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; AMDGPU1: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; AMDGPU1: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; AMDGPU1: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; AMDGPU1: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; AMDGPU1: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; AMDGPU1: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; AMDGPU1: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; AMDGPU1: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; AMDGPU1: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; AMDGPU1: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU1: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU1: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU1: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU1: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU1: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; NVPTX1: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; NVPTX1: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -4996,17 +4979,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; NVPTX1: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; NVPTX1: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; NVPTX1: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; NVPTX1: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; NVPTX1: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; NVPTX1: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; NVPTX1: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; NVPTX1: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; NVPTX1: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; NVPTX1: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; NVPTX1: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX1: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX1: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX1: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX1: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX1: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; AMDGPU2: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; AMDGPU2: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -5016,17 +4991,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU2: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; AMDGPU2: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; AMDGPU2: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; AMDGPU2: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; AMDGPU2: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; AMDGPU2: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; AMDGPU2: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; AMDGPU2: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; AMDGPU2: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; AMDGPU2: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; AMDGPU2: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; AMDGPU2: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU2: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU2: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU2: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU2: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU2: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; AMDGPU3: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; AMDGPU3: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -5036,17 +5003,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU3: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; AMDGPU3: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; AMDGPU3: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; AMDGPU3: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; AMDGPU3: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; AMDGPU3: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; AMDGPU3: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; AMDGPU3: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; AMDGPU3: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; AMDGPU3: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; AMDGPU3: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; AMDGPU3: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU3: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU3: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU3: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU3: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU3: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; NVPTX2: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; NVPTX2: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -5056,17 +5015,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX2: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; NVPTX2: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; NVPTX2: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; NVPTX2: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; NVPTX2: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; NVPTX2: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; NVPTX2: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; NVPTX2: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; NVPTX2: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; NVPTX2: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; NVPTX2: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; NVPTX2: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX2: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX2: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX2: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX2: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX2: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; NVPTX3: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; NVPTX3: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -5076,15 +5027,7 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; NVPTX3: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; NVPTX3: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; NVPTX3: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; NVPTX3: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; NVPTX3: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; NVPTX3: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; NVPTX3: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; NVPTX3: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; NVPTX3: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; NVPTX3: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; NVPTX3: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX3: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX3: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX3: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX3: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX3: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll index f7bfd30650694..ad41639511e99 100644 --- a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll @@ -59,7 +59,7 @@ target triple = "nvptx64" ; Function Attrs: convergent norecurse nounwind -define weak void @__omp_offloading_2a_d80d3d_test_fallback_l11(ptr %dyn) local_unnamed_addr #0 !dbg !15 { +define weak ptx_kernel void @__omp_offloading_2a_d80d3d_test_fallback_l11(ptr %dyn) local_unnamed_addr #0 !dbg !15 { entry: %captured_vars_addrs.i.i = alloca [0 x ptr], align 8 %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_2a_d80d3d_test_fallback_l11_kernel_environment, ptr %dyn) #3, !dbg !18 @@ -104,7 +104,7 @@ declare i32 @__kmpc_global_thread_num(ptr) local_unnamed_addr #3 declare void @__kmpc_target_deinit() local_unnamed_addr ; Function Attrs: norecurse nounwind -define weak void @__omp_offloading_2a_d80d3d_test_no_fallback_l20(ptr %dyn) local_unnamed_addr #4 !dbg !32 { +define weak ptx_kernel void @__omp_offloading_2a_d80d3d_test_no_fallback_l20(ptr %dyn) local_unnamed_addr #4 !dbg !32 { entry: %captured_vars_addrs.i2.i = alloca [0 x ptr], align 8 %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_2a_d80d3d_test_no_fallback_l20_kernel_environment, ptr %dyn) #3, !dbg !33 @@ -175,7 +175,6 @@ attributes #8 = { "llvm.assume"="omp_no_parallelism" } !llvm.dbg.cu = !{!0} !omp_offload.info = !{!3, !4} -!nvvm.annotations = !{!5, !6} !llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13} !llvm.ident = !{!14} @@ -184,8 +183,6 @@ attributes #8 = { "llvm.assume"="omp_no_parallelism" } !2 = !{} !3 = !{i32 0, i32 42, i32 14159165, !"test_no_fallback", i32 20, i32 1} !4 = !{i32 0, i32 42, i32 14159165, !"test_fallback", i32 11, i32 0} -!5 = !{ptr @__omp_offloading_2a_d80d3d_test_fallback_l11, !"kernel", i32 1} -!6 = !{ptr @__omp_offloading_2a_d80d3d_test_no_fallback_l20, !"kernel", i32 1} !7 = !{i32 7, !"Dwarf Version", i32 2} !8 = !{i32 2, !"Debug Info Version", i32 3} !9 = !{i32 1, !"wchar_size", i32 4} diff --git a/llvm/test/Transforms/OpenMP/deduplication_target.ll b/llvm/test/Transforms/OpenMP/deduplication_target.ll index 6b0563365c648..7027c3275b932 100644 --- a/llvm/test/Transforms/OpenMP/deduplication_target.ll +++ b/llvm/test/Transforms/OpenMP/deduplication_target.ll @@ -15,7 +15,7 @@ target triple = "nvptx64" declare void @use(i32) -define weak void @__omp_offloading_50_a3e09bf8_foo_l2(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_50_a3e09bf8_foo_l2(ptr %dyn) #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_50_a3e09bf8_foo_l2 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -56,11 +56,9 @@ attributes #0 = { convergent noinline norecurse nounwind "kernel" "frame-pointer attributes #1 = { nounwind } !omp_offload.info = !{!0} -!nvvm.annotations = !{!1} !llvm.module.flags = !{!2, !3, !4} !0 = !{i32 0, i32 80, i32 -1545561096, !"foo", i32 2, i32 0} -!1 = !{ptr @__omp_offloading_50_a3e09bf8_foo_l2, !"kernel", i32 1} !2 = !{i32 1, !"wchar_size", i32 4} !3 = !{i32 7, !"openmp", i32 50} !4 = !{i32 7, !"openmp-device", i32 50} diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll index 6102201ad4bac..6a4519a161fd6 100644 --- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll +++ 
b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll @@ -19,7 +19,7 @@ target triple = "nvptx64" ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 ;. -define weak void @kernel0(ptr %dyn) "kernel" #0 { +define weak ptx_kernel void @kernel0(ptr %dyn) "kernel" #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel0 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @kernel0_kernel_environment, ptr [[DYN]]) @@ -43,7 +43,7 @@ define weak void @kernel0(ptr %dyn) "kernel" #0 { ret void } -define weak void @kernel1(ptr %dyn) "kernel" #0 { +define weak ptx_kernel void @kernel1(ptr %dyn) "kernel" #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel1 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @kernel1_kernel_environment, ptr [[DYN]]) @@ -63,7 +63,7 @@ define weak void @kernel1(ptr %dyn) "kernel" #0 { ret void } -define weak void @kernel2(ptr %dyn) "kernel" #0 { +define weak ptx_kernel void @kernel2(ptr %dyn) "kernel" #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel2 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: @@ -200,15 +200,11 @@ declare i32 @__kmpc_global_thread_num(ptr) !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3, !4} attributes #0 = { "omp_target_thread_limit"="666" "omp_target_num_teams"="777"} !0 = !{i32 7, !"openmp", i32 50} !1 = !{i32 7, !"openmp-device", i32 50} -!2 = !{ptr @kernel0, !"kernel", i32 1} -!3 = !{ptr @kernel1, !"kernel", i32 1} -!4 = !{ptr @kernel2, !"kernel", i32 1} ;. ; CHECK: attributes #[[ATTR0]] = { "kernel" "omp_target_num_teams"="777" "omp_target_thread_limit"="666" } ; CHECK: attributes #[[ATTR1]] = { nounwind } @@ -217,7 +213,4 @@ attributes #0 = { "omp_target_thread_limit"="666" "omp_target_num_teams"="777"} ;. ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META2:![0-9]+]] = !{ptr @kernel0, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @kernel1, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{ptr @kernel2, !"kernel", i32 1} ;. diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll index 0cf6e7488b4dd..3037d24b8c448 100644 --- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll +++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll @@ -7,7 +7,7 @@ target triple = "nvptx64" ;. ; CHECK: @G = external global i32 ;. 
-define weak void @kernel0() #0 { +define weak ptx_kernel void @kernel0() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel0 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr null, ptr null) @@ -25,7 +25,7 @@ define weak void @kernel0() #0 { ret void } -define weak void @kernel1() #0 { +define weak ptx_kernel void @kernel1() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel1 ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr null, ptr null) @@ -39,7 +39,7 @@ define weak void @kernel1() #0 { ret void } -define weak void @kernel2() #0 { +define weak ptx_kernel void @kernel2() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel2 ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr null, ptr null) @@ -107,15 +107,11 @@ declare void @__kmpc_target_deinit() #1 !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3, !4} attributes #0 = { optnone noinline "kernel" "omp_target_thread_limit"="666" "omp_target_num_teams"="777"} !0 = !{i32 7, !"openmp", i32 50} !1 = !{i32 7, !"openmp-device", i32 50} -!2 = !{ptr @kernel0, !"kernel", i32 1} -!3 = !{ptr @kernel1, !"kernel", i32 1} -!4 = !{ptr @kernel2, !"kernel", i32 1} ; ;. ; CHECK: attributes #[[ATTR0]] = { noinline optnone "kernel" "omp_target_num_teams"="777" "omp_target_thread_limit"="666" } @@ -123,7 +119,4 @@ attributes #0 = { optnone noinline "kernel" "omp_target_thread_limit"="666" "omp ;. ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META2:![0-9]+]] = !{ptr @kernel0, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @kernel1, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{ptr @kernel2, !"kernel", i32 1} ;. 
diff --git a/llvm/test/Transforms/OpenMP/global_constructor.ll b/llvm/test/Transforms/OpenMP/global_constructor.ll index 804b910dcd308..1d18e527e1466 100644 --- a/llvm/test/Transforms/OpenMP/global_constructor.ll +++ b/llvm/test/Transforms/OpenMP/global_constructor.ll @@ -10,7 +10,7 @@ @_ZL6Device = internal global double 0.000000e+00, align 8 @__omp_offloading_fd02_85283c04_main_l11_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } -define weak void @__omp_offloading_fd02_85283c04_main_l11(ptr %dyn, ptr nonnull align 8 dereferenceable(8) %X) local_unnamed_addr "kernel" { +define weak ptx_kernel void @__omp_offloading_fd02_85283c04_main_l11(ptr %dyn, ptr nonnull align 8 dereferenceable(8) %X) local_unnamed_addr "kernel" { entry: %0 = tail call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_85283c04_main_l11_kernel_environment, ptr %dyn) #0 %exec_user_code = icmp eq i32 %0, -1 @@ -39,7 +39,7 @@ declare i32 @__kmpc_target_init(ptr, ptr) local_unnamed_addr declare void @__kmpc_target_deinit() local_unnamed_addr -define weak void @__omp_offloading__fd02_85283c04_Device_l6_ctor() "kernel" { +define weak ptx_kernel void @__omp_offloading__fd02_85283c04_Device_l6_ctor() "kernel" { entry: %call.i = tail call double @__nv_log(double noundef 2.000000e+00) #1 %call.i2 = tail call double @__nv_log(double noundef 2.000000e+00) #1 @@ -58,15 +58,12 @@ attributes #0 = { nounwind } attributes #1 = { convergent nounwind } !omp_offload.info = !{!0, !1, !2} -!nvvm.annotations = !{!3, !4} !llvm.module.flags = !{!5, !6, !7, !8, !9} !llvm.ident = !{!10} !0 = !{i32 0, i32 64770, i32 -2060960764, !"__omp_offloading__fd02_85283c04_Device_l6_ctor", i32 6, i32 1} !1 = !{i32 0, i32 64770, i32 -2060960764, !"main", i32 11, i32 2} !2 = !{i32 1, !"_ZL6Device", i32 0, i32 0} -!3 = !{ptr @__omp_offloading__fd02_85283c04_Device_l6_ctor, !"kernel", i32 1} -!4 = !{ptr @__omp_offloading_fd02_85283c04_main_l11, !"kernel", i32 1} !5 = !{i32 1, !"wchar_size", i32 4} !6 = !{i32 7, !"openmp", i32 50} !7 = !{i32 7, !"openmp-device", i32 50} @@ -86,12 +83,12 @@ attributes #1 = { convergent nounwind } ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @_ZL6Device, align 8, !tbaa [[TBAA11:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @_ZL6Device, align 8, !tbaa [[TBAA9:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @__kmpc_get_hardware_thread_id_in_block() #[[ATTR1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; CHECK: region.guarded: -; CHECK-NEXT: store double [[TMP1]], ptr [[X]], align 8, !tbaa [[TBAA11]] +; CHECK-NEXT: store double [[TMP1]], ptr [[X]], align 8, !tbaa [[TBAA9]] ; CHECK-NEXT: br label [[REGION_BARRIER]] ; CHECK: region.barrier: ; CHECK-NEXT: tail call void @__kmpc_barrier_simple_spmd(ptr nonnull @[[GLOB1:[0-9]+]], i32 [[TMP2]]) #[[ATTR1]] @@ -105,6 +102,6 @@ attributes #1 = { convergent nounwind } ; CHECK-NEXT: [[CALL_I:%.*]] = tail call double @__nv_log(double noundef 2.000000e+00) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[CALL_I2:%.*]] = tail call double @__nv_log(double noundef 2.000000e+00) #[[ATTR2]] ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[CALL_I]], [[CALL_I2]] -; CHECK-NEXT: store double [[DIV]], ptr @_ZL6Device, align 8, !tbaa [[TBAA11]] +; CHECK-NEXT: store double [[DIV]], ptr @_ZL6Device, 
align 8, !tbaa [[TBAA9]] ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/OpenMP/globalization_remarks.ll b/llvm/test/Transforms/OpenMP/globalization_remarks.ll index 878ac9010a7dc..0f37b3e070acd 100644 --- a/llvm/test/Transforms/OpenMP/globalization_remarks.ll +++ b/llvm/test/Transforms/OpenMP/globalization_remarks.ll @@ -13,7 +13,7 @@ target triple = "nvptx64" @S = external local_unnamed_addr global ptr @foo_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null } -define void @foo() "kernel" { +define ptx_kernel void @foo() "kernel" { entry: %c = call i32 @__kmpc_target_init(ptr @foo_kernel_environment, ptr null) %0 = call ptr @__kmpc_alloc_shared(i64 4), !dbg !10 @@ -39,7 +39,6 @@ declare void @__kmpc_target_deinit() !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5, !6} -!nvvm.annotations = !{!7, !8} !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) !1 = !DIFile(filename: "globalization_remarks.c", directory: "/tmp/globalization_remarks.c") @@ -48,7 +47,6 @@ declare void @__kmpc_target_deinit() !4 = !{i32 1, !"wchar_size", i32 4} !5 = !{i32 7, !"openmp", i32 50} !6 = !{i32 7, !"openmp-device", i32 50} -!7 = !{ptr @foo, !"kernel", i32 1} !8 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !9 = !DISubroutineType(types: !2) !10 = !DILocation(line: 5, column: 7, scope: !8) diff --git a/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll b/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll index b029efbbe3c68..ce17ffcbb2084 100644 --- a/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll +++ b/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll @@ -3,11 +3,11 @@ ; CHECK-DAG: remark: :0:0: OpenMP GPU kernel kernel1 ; CHECK-DAG: remark: :0:0: OpenMP GPU kernel kernel2 -define void @kernel1() "kernel" { +define ptx_kernel void @kernel1() "kernel" { ret void } -define void @kernel2() "kernel" { +define ptx_kernel void @kernel2() "kernel" { ret void } @@ -19,10 +19,5 @@ define void @non_kernel() { declare dso_local void @__kmpc_kernel_prepare_parallel(ptr) !llvm.module.flags = !{!4} -!nvvm.annotations = !{!2, !0, !1, !3, !1, !2} -!0 = !{ptr @kernel1, !"kernel", i32 1} -!1 = !{ptr @non_kernel, !"non_kernel", i32 1} -!2 = !{null, !"align", i32 1} -!3 = !{ptr @kernel2, !"kernel", i32 1} !4 = !{i32 7, !"openmp", i32 50} diff --git a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll index 936f7d1c46781..760c5a354a37c 100644 --- a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll +++ b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll @@ -44,7 +44,7 @@ @2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @0 }, align 8 @__omp_offloading_10301_87b2c_foo_l7_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } -define weak void @__omp_offloading_10301_87b2c_foo_l7() "kernel" { +define weak 
ptx_kernel void @__omp_offloading_10301_87b2c_foo_l7() "kernel" { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -173,10 +173,8 @@ entry: } !omp_offload.info = !{!0} -!nvvm.annotations = !{!1} !llvm.module.flags = !{!2, !3} !0 = !{i32 0, i32 66305, i32 555956, !"foo", i32 7, i32 0} -!1 = !{ptr @__omp_offloading_10301_87b2c_foo_l7, !"kernel", i32 1} !2 = !{i32 7, !"openmp", i32 50} !3 = !{i32 7, !"openmp-device", i32 50} diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll index 310ac0a8296c3..2b3a7fabfb459 100644 --- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll +++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll @@ -18,7 +18,7 @@ target triple = "nvptx64" ; CHECK: @none_spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null } ; CHECK: @will_not_be_spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null } ;. -define weak void @is_spmd() "kernel" { +define weak ptx_kernel void @is_spmd() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@is_spmd ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @is_spmd_kernel_environment, ptr null) @@ -36,7 +36,7 @@ define weak void @is_spmd() "kernel" { ret void } -define weak void @will_be_spmd() "kernel" { +define weak ptx_kernel void @will_be_spmd() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@will_be_spmd ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: @@ -70,7 +70,7 @@ user_code.entry: ret void } -define weak void @non_spmd() "kernel" { +define weak ptx_kernel void @non_spmd() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@non_spmd ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @none_spmd_kernel_environment, ptr null) @@ -88,7 +88,7 @@ define weak void @non_spmd() "kernel" { ret void } -define weak void @will_not_be_spmd() "kernel" { +define weak ptx_kernel void @will_not_be_spmd() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@will_not_be_spmd ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @will_not_be_spmd_kernel_environment, ptr null) @@ -207,14 +207,9 @@ declare void @foo() declare void @bar() !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3, !4, !5} !0 = !{i32 7, !"openmp", i32 50} !1 = !{i32 7, !"openmp-device", i32 50} -!2 = !{ptr @is_spmd, !"kernel", i32 1} -!3 = !{ptr @will_be_spmd, !"kernel", i32 1} -!4 = !{ptr @non_spmd, !"kernel", i32 1} -!5 = !{ptr @will_not_be_spmd, !"kernel", i32 1} ;. ; CHECK: attributes #[[ATTR0]] = { "kernel" } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { "llvm.assume"="ompx_spmd_amenable" } @@ -223,8 +218,4 @@ declare void @bar() ;. ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META2:![0-9]+]] = !{ptr @is_spmd, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @will_be_spmd, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{ptr @non_spmd, !"kernel", i32 1} -; CHECK: [[META5:![0-9]+]] = !{ptr @will_not_be_spmd, !"kernel", i32 1} ;. 
diff --git a/llvm/test/Transforms/OpenMP/nested_parallelism.ll b/llvm/test/Transforms/OpenMP/nested_parallelism.ll index 5c4386b24a3d5..1679a27fdae8b 100644 --- a/llvm/test/Transforms/OpenMP/nested_parallelism.ll +++ b/llvm/test/Transforms/OpenMP/nested_parallelism.ll @@ -43,7 +43,7 @@ target triple = "nvptx64" ; CHECK: @__omp_offloading_10302_bd7e0_main_l13_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ; CHECK: @__omp_offloading_10302_bd7e0_main_l16_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ;. -define weak_odr protected void @__omp_offloading_10302_bd7e0_main_l13(ptr %dyn, i64 noundef %i) local_unnamed_addr "kernel" { +define weak_odr protected ptx_kernel void @__omp_offloading_10302_bd7e0_main_l13(ptr %dyn, i64 noundef %i) local_unnamed_addr "kernel" { ; CHECK-LABEL: @__omp_offloading_10302_bd7e0_main_l13( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [1 x ptr], align 8 @@ -127,7 +127,7 @@ entry: declare void @__kmpc_target_deinit(ptr, i8) local_unnamed_addr -define weak_odr protected void @__omp_offloading_10302_bd7e0_main_l16(ptr %dyn, i64 noundef %i) local_unnamed_addr "kernel" { +define weak_odr protected ptx_kernel void @__omp_offloading_10302_bd7e0_main_l16(ptr %dyn, i64 noundef %i) local_unnamed_addr "kernel" { ; CHECK-LABEL: @__omp_offloading_10302_bd7e0_main_l16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [1 x ptr], align 8 @@ -315,13 +315,10 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #11 !omp_offload.info = !{!0, !1} -!nvvm.annotations = !{!2, !3} !llvm.module.flags = !{!4, !5} !0 = !{i32 0, i32 66306, i32 776160, !"main", i32 13, i32 0, i32 0} !1 = !{i32 0, i32 66306, i32 776160, !"main", i32 16, i32 0, i32 1} -!2 = !{ptr @__omp_offloading_10302_bd7e0_main_l13, !"kernel", i32 1} -!3 = !{ptr @__omp_offloading_10302_bd7e0_main_l16, !"kernel", i32 1} !4 = !{i32 7, !"openmp", i32 50} !5 = !{i32 7, !"openmp-device", i32 50} @@ -336,8 +333,6 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #11 ;. ; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 66306, i32 776160, !"main", i32 13, i32 0, i32 0} ; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 66306, i32 776160, !"main", i32 16, i32 0, i32 1} -; CHECK: [[META2:![0-9]+]] = !{ptr @__omp_offloading_10302_bd7e0_main_l13, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @__omp_offloading_10302_bd7e0_main_l16, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
diff --git a/llvm/test/Transforms/OpenMP/parallel_level_fold.ll b/llvm/test/Transforms/OpenMP/parallel_level_fold.ll index fd6e7683af8e3..e5f65b26ed223 100644 --- a/llvm/test/Transforms/OpenMP/parallel_level_fold.ll +++ b/llvm/test/Transforms/OpenMP/parallel_level_fold.ll @@ -16,7 +16,7 @@ target triple = "nvptx64" ; CHECK: @spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null } ; CHECK: @parallel_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null } ;. -define weak void @none_spmd() "kernel" { +define weak ptx_kernel void @none_spmd() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@none_spmd ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @none_spmd_kernel_environment, ptr null) @@ -32,7 +32,7 @@ define weak void @none_spmd() "kernel" { ret void } -define weak void @spmd() "kernel" { +define weak ptx_kernel void @spmd() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@spmd ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_kernel_environment, ptr null) @@ -48,7 +48,7 @@ define weak void @spmd() "kernel" { ret void } -define weak void @parallel() "kernel" { +define weak ptx_kernel void @parallel() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@parallel ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @parallel_kernel_environment, ptr null) @@ -136,20 +136,13 @@ declare i32 @__kmpc_target_init(ptr, ptr) #1 declare void @__kmpc_target_deinit() #1 !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3, !4} !0 = !{i32 7, !"openmp", i32 50} !1 = !{i32 7, !"openmp-device", i32 50} -!2 = !{ptr @none_spmd, !"kernel", i32 1} -!3 = !{ptr @spmd, !"kernel", i32 1} -!4 = !{ptr @parallel, !"kernel", i32 1} ;. ; CHECK: attributes #[[ATTR0]] = { "kernel" } ; CHECK: attributes #[[ATTR1]] = { alwaysinline } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META2:![0-9]+]] = !{ptr @none_spmd, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @spmd, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{ptr @parallel, !"kernel", i32 1} ;. 
diff --git a/llvm/test/Transforms/OpenMP/remove_globalization.ll b/llvm/test/Transforms/OpenMP/remove_globalization.ll index 31e3ef2b9079f..29f2030c4d42b 100644 --- a/llvm/test/Transforms/OpenMP/remove_globalization.ll +++ b/llvm/test/Transforms/OpenMP/remove_globalization.ll @@ -40,7 +40,7 @@ define weak i32 @__kmpc_target_init(ptr %0, ptr) { } declare void @__kmpc_target_deinit() -define void @kernel(ptr %dyn) "kernel" { +define ptx_kernel void @kernel(ptr %dyn) "kernel" { ; CHECK-LABEL: define {{[^@]+}}@kernel ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -98,14 +98,14 @@ define internal void @bar() { ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 4 -; CHECK-NEXT: call void @share(ptr nofree [[DOTH2S]]) #[[ATTR5:[0-9]+]], !dbg [[DBG8:![0-9]+]] +; CHECK-NEXT: call void @share(ptr nofree [[DOTH2S]]) #[[ATTR5:[0-9]+]], !dbg [[DBG7:![0-9]+]] ; CHECK-NEXT: ret void ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@bar ; CHECK-DISABLED-SAME: () #[[ATTR1]] { ; CHECK-DISABLED-NEXT: entry: ; CHECK-DISABLED-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 4 -; CHECK-DISABLED-NEXT: call void @share(ptr nofree [[DOTH2S]]) #[[ATTR5:[0-9]+]], !dbg [[DBG8:![0-9]+]] +; CHECK-DISABLED-NEXT: call void @share(ptr nofree [[DOTH2S]]) #[[ATTR5:[0-9]+]], !dbg [[DBG7:![0-9]+]] ; CHECK-DISABLED-NEXT: ret void ; entry: @@ -146,7 +146,7 @@ define void @unused() { ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@unused() { ; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR6:[0-9]+]], !dbg [[DBG11:![0-9]+]] +; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR6:[0-9]+]], !dbg [[DBG10:![0-9]+]] ; CHECK-DISABLED-NEXT: call void @__kmpc_free_shared(ptr [[TMP0]], i64 4) #[[ATTR6]] ; CHECK-DISABLED-NEXT: ret void ; @@ -234,14 +234,12 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !6, !7} -!nvvm.annotations = !{!5} !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) !1 = !DIFile(filename: "remove_globalization.c", directory: "/tmp/remove_globalization.c") !2 = !{} !3 = !{i32 2, !"Debug Info Version", i32 3} !4 = !{i32 1, !"wchar_size", i32 4} -!5 = !{ptr @kernel, !"kernel", i32 1} !6 = !{i32 7, !"openmp", i32 50} !7 = !{i32 7, !"openmp-device", i32 50} !8 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) @@ -276,10 +274,9 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" ; CHECK: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META7:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1} -; CHECK: [[DBG8]] = !DILocation(line: 4, column: 2, scope: [[META9:![0-9]+]]) -; CHECK: [[META9]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META10:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]]) -; CHECK: [[META10]] = !DISubroutineType(types: [[META2]]) +; CHECK: [[DBG7]] = !DILocation(line: 4, 
column: 2, scope: [[META8:![0-9]+]]) +; CHECK: [[META8]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META9:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[META9]] = !DISubroutineType(types: [[META2]]) ;. ; CHECK-DISABLED: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None) ; CHECK-DISABLED: [[META1]] = !DIFile(filename: "remove_globalization.c", directory: {{.*}}) @@ -288,11 +285,10 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" ; CHECK-DISABLED: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; CHECK-DISABLED: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK-DISABLED: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK-DISABLED: [[META7:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1} -; CHECK-DISABLED: [[DBG8]] = !DILocation(line: 4, column: 2, scope: [[META9:![0-9]+]]) -; CHECK-DISABLED: [[META9]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META10:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]]) -; CHECK-DISABLED: [[META10]] = !DISubroutineType(types: [[META2]]) -; CHECK-DISABLED: [[DBG11]] = !DILocation(line: 6, column: 2, scope: [[META9]]) +; CHECK-DISABLED: [[DBG7]] = !DILocation(line: 4, column: 2, scope: [[META8:![0-9]+]]) +; CHECK-DISABLED: [[META8]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META9:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK-DISABLED: [[META9]] = !DISubroutineType(types: [[META2]]) +; CHECK-DISABLED: [[DBG10]] = !DILocation(line: 6, column: 2, scope: [[META8]]) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; CHECK-REMARKS: {{.*}} diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll index 6e4fb9e57388b..92cfd75049226 100644 --- a/llvm/test/Transforms/OpenMP/replace_globalization.ll +++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -25,7 +25,7 @@ target triple = "nvptx64" @baz_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } -define dso_local void @foo(ptr %dyn) "kernel" { +define dso_local ptx_kernel void @foo(ptr %dyn) "kernel" { entry: %c = call i32 @__kmpc_target_init(ptr @foo_kernel_environment, ptr %dyn) %x = call align 4 ptr @__kmpc_alloc_shared(i64 4) @@ -36,7 +36,7 @@ entry: ret void } -define void @bar(ptr %dyn) "kernel" { +define ptx_kernel void @bar(ptr %dyn) "kernel" { %c = call i32 @__kmpc_target_init(ptr @bar_kernel_environment, ptr %dyn) call void @unknown_no_openmp() %cmp = icmp eq i32 %c, -1 @@ -60,7 +60,7 @@ exit: ret void } -define void @baz_spmd(ptr %dyn) "kernel" { +define ptx_kernel void @baz_spmd(ptr %dyn) "kernel" { %c = call i32 @__kmpc_target_init(ptr @baz_kernel_environment, ptr %dyn) call void @unknown_no_openmp() %c0 = icmp eq i32 %c, -1 @@ -109,7 +109,6 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5, !6} -!nvvm.annotations = !{!7, !8, !13} !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) !1 = !DIFile(filename: "replace_globalization.c", directory: "/tmp/replace_globalization.c") @@ -118,9 +117,6 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" !4 = !{i32 1, !"wchar_size", i32 4} !5 = !{i32 7, !"openmp", i32 50} !6 = !{i32 7, !"openmp-device", i32 50} -!7 = !{ptr @foo, !"kernel", i32 1} -!8 = !{ptr @bar, !"kernel", i32 1} -!13 = !{ptr @baz_spmd, !"kernel", i32 1} !9 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !10, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !10 = !DISubroutineType(types: !2) !11 = !DILocation(line: 5, column: 7, scope: !9) @@ -177,7 +173,7 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" ; CHECK-NEXT: [[C0:%.*]] = icmp eq i32 [[C]], -1 ; CHECK-NEXT: br i1 [[C0]], label [[MASTER3:%.*]], label [[EXIT:%.*]] ; CHECK: master3: -; CHECK-NEXT: [[Z:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 24) #[[ATTR6]], !dbg [[DBG10:![0-9]+]] +; CHECK-NEXT: [[Z:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 24) #[[ATTR6]], !dbg [[DBG7:![0-9]+]] ; CHECK-NEXT: call void @use.internalized(ptr nofree [[Z]]) #[[ATTR7]] ; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[Z]], i64 24) #[[ATTR8]] ; CHECK-NEXT: br label [[EXIT]] @@ -231,12 +227,9 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" ; CHECK: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META7:![0-9]+]] = !{ptr @foo, !"kernel", i32 1} -; CHECK: [[META8:![0-9]+]] = !{ptr @bar, !"kernel", i32 1} -; CHECK: [[META9:![0-9]+]] = !{ptr @baz_spmd, !"kernel", i32 1} -; CHECK: [[DBG10]] = !DILocation(line: 5, column: 14, scope: 
[[META11:![0-9]+]]) -; CHECK: [[META11]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META12:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]]) -; CHECK: [[META12]] = !DISubroutineType(types: [[META2]]) +; CHECK: [[DBG7]] = !DILocation(line: 5, column: 14, scope: [[META8:![0-9]+]]) +; CHECK: [[META8]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META9:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[META9]] = !DISubroutineType(types: [[META2]]) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-LIMIT: {{.*}} diff --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll index c186e5f04f092..70b9ce41c1a43 100644 --- a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll +++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll @@ -16,7 +16,7 @@ ; CHECK: [openmp-opt] Basic block @kernel if.then is executed by a single thread. ; CHECK-NOT: [openmp-opt] Basic block @kernel if.else is executed by a single thread. ; CHECK-NOT: [openmp-opt] Basic block @kernel if.end is executed by a single thread. -define void @kernel(ptr %dyn) "kernel" { +define ptx_kernel void @kernel(ptr %dyn) "kernel" { %call = call i32 @__kmpc_target_init(ptr @kernel_kernel_environment, ptr %dyn) %cmp = icmp eq i32 %call, -1 br i1 %cmp, label %if.then, label %if.else @@ -116,7 +116,6 @@ attributes #0 = { cold noinline } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5, !6} -!nvvm.annotations = !{!7} !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) !1 = !DIFile(filename: "single_threaded_execution.c", directory: "/tmp/single_threaded_execution.c") @@ -125,7 +124,6 @@ attributes #0 = { cold noinline } !4 = !{i32 1, !"wchar_size", i32 4} !5 = !{i32 7, !"openmp", i32 50} !6 = !{i32 7, !"openmp-device", i32 50} -!7 = !{ptr @kernel, !"kernel", i32 1} !8 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 8, type: !10, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !9 = distinct !DISubprogram(name: "cold", scope: !1, file: !1, line: 8, type: !10, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !10 = !DISubroutineType(types: !2) diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll index 6ff4b96b57556..983175382f0f0 100644 --- a/llvm/test/Transforms/OpenMP/spmdization.ll +++ b/llvm/test/Transforms/OpenMP/spmdization.ll @@ -105,36 +105,6 @@ @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } -; 
AMDGPU-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" -; AMDGPU-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8 -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null } -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1 }, ptr @[[GLOB1]], ptr null } -; AMDGPU-DISABLED: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4 -; AMDGPU-DISABLED: @[[X_SHARED_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4 -; AMDGPU-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; AMDGPU-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; AMDGPU-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; AMDGPU-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; AMDGPU-DISABLED: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; NVPTX-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" -; NVPTX-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8 -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; NVPTX-DISABLED: 
@[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null } -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1 }, ptr @[[GLOB1]], ptr null } -; NVPTX-DISABLED: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4 -; NVPTX-DISABLED: @[[X_SHARED1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4 -; NVPTX-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; NVPTX-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; NVPTX-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; NVPTX-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; NVPTX-DISABLED: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ;. ; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8 @@ -226,7 +196,7 @@ ; NVPTX-DISABLED2: @__omp_outlined__7_wrapper.ID = private constant i8 undef ; NVPTX-DISABLED2: @__omp_outlined__9_wrapper.ID = private constant i8 undef ;. 
-define weak void @__omp_offloading_fd02_2044372e_sequential_loop_l5() #0 { +define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_l5() #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 ; AMDGPU-SAME: () #[[ATTR0:[0-9]+]] { ; AMDGPU-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() @@ -256,15 +226,6 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_l5() #0 { ; NVPTX-DISABLED2-SAME: () #[[ATTR0:[0-9]+]] { ; NVPTX-DISABLED2-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 -; AMDGPU-DISABLED-SAME: () #[[ATTR0:[0-9]+]] { -; AMDGPU-DISABLED-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 -; NVPTX-DISABLED-SAME: () #[[ATTR0:[0-9]+]] { -; NVPTX-DISABLED-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() -; NVPTX-DISABLED-NEXT: ret void call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ret void } @@ -282,7 +243,7 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; AMDGPU-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -299,7 +260,7 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; NVPTX-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -350,7 +311,7 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -401,7 +362,7 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-DISABLED2-NEXT: store 
i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -451,7 +412,7 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -501,114 +462,10 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug -; AMDGPU-DISABLED-SAME: () #[[ATTR1:[0-9]+]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment) -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: 
[[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__1_wrapper.ID -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug -; NVPTX-DISABLED-SAME: () #[[ATTR1:[0-9]+]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment) -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 
@__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__1_wrapper.ID -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -643,10 +500,10 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. 
; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__ ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -661,10 +518,10 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__ ; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -679,10 +536,10 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__ ; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -697,10 +554,10 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. 
; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__ ; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -715,10 +572,10 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__ ; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -733,45 +590,10 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. 
; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__ -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED: for.cond: -; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED: for.cond.cleanup: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: for.body: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) -; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__ -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED: for.cond: -; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED: for.cond.cleanup: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: for.body: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) -; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] entry: %captured_vars_addrs = alloca [0 x ptr], align 8 br label %for.cond @@ -829,17 +651,6 @@ define internal void @__omp_outlined__1(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: entry: ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define 
{{[^@]+}}@__omp_outlined__1 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] -; NVPTX-DISABLED-NEXT: ret void entry: call void @unknown() #11 ret void @@ -906,25 +717,6 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: ret void entry: %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 @@ -937,7 +729,7 @@ entry: } ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20() #0 { +define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20() #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: @@ -950,7 +742,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l2 ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -967,7 +759,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l2 ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, 
!tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -1018,7 +810,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l2 ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -1069,7 +861,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l2 ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -1119,7 +911,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l2 ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -1169,114 +961,10 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l2 ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 -; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr 
@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment) -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__3_wrapper.ID -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; 
AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 -; NVPTX-DISABLED-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment) -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__3_wrapper.ID -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: 
worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -1314,10 +1002,10 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__2 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -1334,10 +1022,10 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__2 ; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -1355,10 +1043,10 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; 
AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__2 ; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -1376,10 +1064,10 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__2 ; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -1396,10 +1084,10 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__2 ; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -1416,50 +1104,10 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 -; AMDGPU-DISABLED-NEXT: [[MALLOC_CAST:%.*]] = 
addrspacecast ptr addrspace(5) [[X_H2S]] to ptr -; AMDGPU-DISABLED-NEXT: call void @use(ptr nocapture [[MALLOC_CAST]]) #[[ATTR7]] -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED: for.cond: -; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED: for.cond.cleanup: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: for.body: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) -; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4 -; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 -; NVPTX-DISABLED-NEXT: call void @use(ptr nocapture [[X_H2S]]) #[[ATTR7]] -; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED: for.cond: -; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED: for.cond.cleanup: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: for.body: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) -; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] entry: %captured_vars_addrs = alloca [0 x ptr], align 8 %x = call align 4 ptr @__kmpc_alloc_shared(i64 4) @@ -1519,17 +1167,6 @@ define internal void @__omp_outlined__3(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: entry: ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; NVPTX-DISABLED-NEXT: ret void entry: call void @unknown() #11 ret void @@ -1596,25 +1233,6 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 { ; 
NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: ret void entry: %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 @@ -1628,7 +1246,7 @@ entry: ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35() #0 { +define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35() #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: @@ -1641,7 +1259,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -1658,7 +1276,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -1709,7 +1327,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; 
AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -1760,7 +1378,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -1810,7 +1428,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -1860,114 +1478,10 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 -; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment) -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], 
[[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__5_wrapper.ID -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 -; NVPTX-DISABLED-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: 
[[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment) -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__5_wrapper.ID -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], ptr 
[[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -2002,11 +1516,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__4 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -2021,11 +1535,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__4 ; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -2040,11 +1554,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 
[[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__4 ; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -2059,11 +1573,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__4 ; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -2078,11 +1592,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__4 ; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -2097,48 +1611,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; 
NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED: for.cond: -; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED: for.cond.cleanup: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: for.body: -; AMDGPU-DISABLED-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) -; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED: for.cond: -; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED: for.cond.cleanup: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: for.body: -; NVPTX-DISABLED-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) -; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] entry: %captured_vars_addrs = alloca [1 x ptr], align 
8 %x = call align 4 ptr @__kmpc_alloc_shared(i64 4) @@ -2167,73 +1644,56 @@ define internal void @__omp_outlined__5(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5 ; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5 ; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; AMDGPU-DISABLED1-NEXT: entry: -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED1-NEXT: ret void ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5 ; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; AMDGPU-DISABLED2-NEXT: entry: -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED2-NEXT: ret void ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5 ; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; NVPTX-DISABLED1-NEXT: entry: -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; 
NVPTX-DISABLED1-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED1-NEXT: ret void ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5 ; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; NVPTX-DISABLED2-NEXT: entry: -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; NVPTX-DISABLED-NEXT: ret void entry: %0 = load i32, ptr %x, align 4, !tbaa !18 %inc = add nsw i32 %0, 1 @@ -2252,7 +1712,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; @@ -2264,7 +1724,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; @@ -2276,7 +1736,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED1-NEXT: 
[[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void ; @@ -2288,7 +1748,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void ; @@ -2300,7 +1760,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void ; @@ -2312,32 +1772,9 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; 
NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: ret void entry: %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 @@ -2352,7 +1789,7 @@ entry: } ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50() #0 { +define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50() #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: @@ -2365,7 +1802,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_g ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -2382,7 +1819,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_g ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -2433,7 +1870,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_g ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -2484,7 +1921,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_g ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -2534,7 +1971,7 @@ define weak void 
@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_g ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -2584,114 +2021,10 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_g ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 -; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment) -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; 
AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__7_wrapper.ID -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 -; NVPTX-DISABLED-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment) -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void 
@__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__7_wrapper.ID -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. 
= alloca i32, align 4 @@ -2723,7 +2056,7 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 ; AMDGPU-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; AMDGPU: region.guarded: -; AMDGPU-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: br label [[REGION_GUARDED_END:%.*]] ; AMDGPU: region.guarded.end: ; AMDGPU-NEXT: br label [[REGION_BARRIER]] @@ -2740,11 +2073,11 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__6 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -2756,7 +2089,7 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; NVPTX-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 ; NVPTX-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; NVPTX: region.guarded: -; NVPTX-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: br label [[REGION_GUARDED_END:%.*]] ; NVPTX: region.guarded.end: ; NVPTX-NEXT: br label [[REGION_BARRIER]] @@ -2773,17 +2106,17 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__6 ; AMDGPU-DISABLED1-SAME: (ptr noalias 
[[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { ; AMDGPU-DISABLED1-NEXT: entry: ; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; AMDGPU-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU-DISABLED1: for.cond: ; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] @@ -2793,17 +2126,17 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__6 ; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { ; AMDGPU-DISABLED2-NEXT: entry: ; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; AMDGPU-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU-DISABLED2: for.cond: ; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] @@ -2813,17 +2146,17 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; 
AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__6 ; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { ; NVPTX-DISABLED1-NEXT: entry: ; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; NVPTX-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: br label [[FOR_COND:%.*]] ; NVPTX-DISABLED1: for.cond: ; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] @@ -2833,17 +2166,17 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__6 ; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { ; NVPTX-DISABLED2-NEXT: entry: ; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; NVPTX-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: br label [[FOR_COND:%.*]] ; NVPTX-DISABLED2: for.cond: ; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] @@ -2853,50 +2186,11 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-DISABLED2-NEXT: 
[[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; AMDGPU-DISABLED-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED: for.cond: -; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED: for.cond.cleanup: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: for.body: -; AMDGPU-DISABLED-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) -; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; NVPTX-DISABLED-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED: for.cond: -; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED: for.cond.cleanup: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: for.body: -; NVPTX-DISABLED-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) -; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] entry: %captured_vars_addrs = alloca [1 x ptr], align 8 %x = call align 4 ptr @__kmpc_alloc_shared(i64 4) @@ -2926,73 +2220,56 @@ define internal void @__omp_outlined__7(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7 ; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) 
[[X:%.*]]) { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7 ; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; AMDGPU-DISABLED1-NEXT: entry: -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; AMDGPU-DISABLED1-NEXT: ret void ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7 ; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; AMDGPU-DISABLED2-NEXT: entry: -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; AMDGPU-DISABLED2-NEXT: ret void ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7 ; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; NVPTX-DISABLED1-NEXT: entry: -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; NVPTX-DISABLED1-NEXT: ret void ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7 ; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) 
[[X:%.*]]) { ; NVPTX-DISABLED2-NEXT: entry: -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] -; NVPTX-DISABLED-NEXT: ret void entry: %0 = load i32, ptr %x, align 4, !tbaa !18 %inc = add nsw i32 %0, 1 @@ -3011,7 +2288,7 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; @@ -3023,7 +2300,7 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; @@ -3035,7 +2312,7 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr 
[[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void ; @@ -3047,7 +2324,7 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void ; @@ -3059,7 +2336,7 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void ; @@ -3071,32 +2348,9 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7(ptr 
[[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: ret void entry: %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 @@ -3111,7 +2365,7 @@ entry: } ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65() #0 { +define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65() #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: @@ -3384,96 +2638,6 @@ define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65() #0 ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 -; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment) -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; 
AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR5]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 -; NVPTX-DISABLED-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment) -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label 
[[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR5]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -3530,24 +2694,13 @@ define internal void @__omp_outlined__8(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: entry: ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; NVPTX-DISABLED-NEXT: ret void entry: call void @unknown() #11 ret void } ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74() #0 { +define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74() #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: @@ -3862,110 +3015,6 @@ define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74() #0 { ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment) -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 
[[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() -; 
AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; NVPTX-DISABLED-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment) -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label 
[[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]] -; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] entry: %captured_vars_addrs = alloca [0 x ptr], align 8 %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null) @@ -4021,17 +3070,6 @@ define internal void @.omp_outlined.(i32 %.global_tid., ptr noalias %.part_id., ; NVPTX-DISABLED2-NEXT: entry: ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@.omp_outlined. -; AMDGPU-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@.omp_outlined. -; NVPTX-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; NVPTX-DISABLED-NEXT: ret void entry: call void @spmd_amenable() #10 ret void @@ -4093,13 +3131,6 @@ define weak i32 @__kmpc_target_init(ptr, ptr) { ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__kmpc_target_init ; NVPTX-DISABLED2-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; NVPTX-DISABLED2-NEXT: ret i32 0 -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__kmpc_target_init -; AMDGPU-DISABLED-SAME: (ptr [[TMP0:%.*]]) { -; AMDGPU-DISABLED-NEXT: ret i32 0 -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__kmpc_target_init -; NVPTX-DISABLED-SAME: (ptr [[TMP0:%.*]]) { -; NVPTX-DISABLED-NEXT: ret i32 0 ret i32 0 } @@ -4158,16 +3189,6 @@ define internal void @__omp_outlined__9(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void ; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; NVPTX-DISABLED-NEXT: ret void entry: call void @unknown() #11 ret void @@ -4234,25 +3255,6 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 { ; 
NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: ret void entry: %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 @@ -4280,7 +3282,6 @@ attributes #10 = { convergent "llvm.assume"="ompx_spmd_amenable" } attributes #11 = { convergent } !omp_offload.info = !{!0, !1, !2, !3, !4, !5} -!nvvm.annotations = !{!6, !7, !8, !9, !10, !11} !llvm.module.flags = !{!12, !13, !14, !15, !16} !llvm.ident = !{!17} @@ -4290,12 +3291,6 @@ attributes #11 = { convergent } !3 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} !4 = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} !5 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -!6 = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -!7 = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -!8 = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -!9 = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -!10 = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -!11 = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} !12 = !{i32 1, !"wchar_size", i32 4} !13 = !{i32 7, !"openmp", i32 50} !14 = !{i32 7, !"openmp-device", i32 50} @@ -4317,92 +3312,6 @@ attributes #11 = { convergent } !30 = !{!31, !27, i64 0} !31 = !{!"kmp_task_t_with_privates", !32, i64 0} !32 = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32} -; AMDGPU-DISABLED: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR1]] = { norecurse } -; AMDGPU-DISABLED: attributes #[[ATTR2]] = { convergent norecurse nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR3]] = { alwaysinline norecurse nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR4]] = { alwaysinline convergent nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR5]] = { nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR6:[0-9]+]] = { nosync nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR7:[0-9]+]] 
= { nofree nosync nounwind allocsize(0) } -; AMDGPU-DISABLED: attributes #[[ATTR8]] = { convergent "llvm.assume"="ompx_spmd_amenable" } -; AMDGPU-DISABLED: attributes #[[ATTR9]] = { convergent } -; AMDGPU-DISABLED: attributes #[[ATTR10:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -; AMDGPU-DISABLED: attributes #[[ATTR11:[0-9]+]] = { alwaysinline } -; AMDGPU-DISABLED: attributes #[[ATTR12:[0-9]+]] = { convergent nounwind } -; NVPTX-DISABLED: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind } -; NVPTX-DISABLED: attributes #[[ATTR1]] = { norecurse } -; NVPTX-DISABLED: attributes #[[ATTR2]] = { convergent norecurse nounwind } -; NVPTX-DISABLED: attributes #[[ATTR3]] = { alwaysinline norecurse nounwind } -; NVPTX-DISABLED: attributes #[[ATTR4]] = { alwaysinline convergent nounwind } -; NVPTX-DISABLED: attributes #[[ATTR5]] = { nounwind } -; NVPTX-DISABLED: attributes #[[ATTR6:[0-9]+]] = { nosync nounwind } -; NVPTX-DISABLED: attributes #[[ATTR7:[0-9]+]] = { nofree nosync nounwind allocsize(0) } -; NVPTX-DISABLED: attributes #[[ATTR8]] = { convergent "llvm.assume"="ompx_spmd_amenable" } -; NVPTX-DISABLED: attributes #[[ATTR9]] = { convergent } -; NVPTX-DISABLED: attributes #[[ATTR10:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -; NVPTX-DISABLED: attributes #[[ATTR11:[0-9]+]] = { alwaysinline } -; NVPTX-DISABLED: attributes #[[ATTR12:[0-9]+]] = { convergent nounwind } -; AMDGPU-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} -; AMDGPU-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} -; AMDGPU-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0} -; AMDGPU-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} -; AMDGPU-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} -; AMDGPU-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; AMDGPU-DISABLED: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU-DISABLED: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU-DISABLED: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; AMDGPU-DISABLED: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; AMDGPU-DISABLED: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; AMDGPU-DISABLED: [[META17:![0-9]+]] = !{!"clang version 14.0.0"} -; AMDGPU-DISABLED: [[TBAA18]] = !{!19, !19, i64 0} -; AMDGPU-DISABLED: [[META19:![0-9]+]] = !{!"int", !20, i64 0} -; AMDGPU-DISABLED: [[META20:![0-9]+]] = 
!{!"omnipotent char", !21, i64 0} -; AMDGPU-DISABLED: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"} -; AMDGPU-DISABLED: [[LOOP22]] = distinct !{!22, !23, !24} -; AMDGPU-DISABLED: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"} -; AMDGPU-DISABLED: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"} -; AMDGPU-DISABLED: [[LOOP25]] = distinct !{!25, !23, !24} -; AMDGPU-DISABLED: [[TBAA26]] = !{!27, !27, i64 0} -; AMDGPU-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0} -; AMDGPU-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24} -; AMDGPU-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24} -; NVPTX-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} -; NVPTX-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} -; NVPTX-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0} -; NVPTX-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} -; NVPTX-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} -; NVPTX-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; NVPTX-DISABLED: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; NVPTX-DISABLED: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; NVPTX-DISABLED: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; NVPTX-DISABLED: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; NVPTX-DISABLED: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; NVPTX-DISABLED: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; NVPTX-DISABLED: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX-DISABLED: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX-DISABLED: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; NVPTX-DISABLED: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; NVPTX-DISABLED: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; NVPTX-DISABLED: [[META17:![0-9]+]] = !{!"clang version 14.0.0"} -; NVPTX-DISABLED: [[TBAA18]] = !{!19, !19, i64 0} -; NVPTX-DISABLED: [[META19:![0-9]+]] = !{!"int", !20, i64 0} -; NVPTX-DISABLED: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0} -; NVPTX-DISABLED: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"} -; NVPTX-DISABLED: [[LOOP22]] = distinct !{!22, !23, !24} -; NVPTX-DISABLED: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"} -; NVPTX-DISABLED: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"} -; NVPTX-DISABLED: [[LOOP25]] = distinct !{!25, !23, !24} -; NVPTX-DISABLED: [[TBAA26]] = !{!27, !27, i64 0} -; NVPTX-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0} -; NVPTX-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24} -; NVPTX-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24} ;. 
; AMDGPU: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "kernel" } ; AMDGPU: attributes #[[ATTR1]] = { norecurse } @@ -4488,30 +3397,24 @@ attributes #11 = { convergent } ; AMDGPU: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; AMDGPU: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; AMDGPU: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; AMDGPU: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; AMDGPU: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; AMDGPU: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; AMDGPU: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; AMDGPU: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; AMDGPU: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; AMDGPU: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; AMDGPU: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; AMDGPU: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; AMDGPU: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; AMDGPU: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -; AMDGPU: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0} -; AMDGPU: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} -; AMDGPU: [[META21]] = !{!"Simple C/C++ TBAA"} -; AMDGPU: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]} -; AMDGPU: [[META23]] = !{!"llvm.loop.mustprogress"} -; AMDGPU: [[META24]] = !{!"llvm.loop.unroll.disable"} -; AMDGPU: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]} -; AMDGPU: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -; AMDGPU: [[META27]] = !{!"any pointer", [[META20]], i64 0} -; AMDGPU: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]} -; AMDGPU: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]} +; AMDGPU: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; AMDGPU: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; AMDGPU: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; AMDGPU: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; AMDGPU: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; AMDGPU: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; AMDGPU: [[META15]] = !{!"Simple C/C++ TBAA"} +; AMDGPU: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; AMDGPU: [[META17]] = !{!"llvm.loop.mustprogress"} +; AMDGPU: [[META18]] = !{!"llvm.loop.unroll.disable"} +; AMDGPU: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; AMDGPU: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; AMDGPU: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; AMDGPU: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} +; AMDGPU: [[LOOP23]] = distinct !{[[LOOP23]], 
[[META17]], [[META18]]} ;. ; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -4519,30 +3422,24 @@ attributes #11 = { convergent } ; NVPTX: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; NVPTX: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; NVPTX: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; NVPTX: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; NVPTX: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; NVPTX: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; NVPTX: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; NVPTX: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; NVPTX: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; NVPTX: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; NVPTX: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; NVPTX: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; NVPTX: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; NVPTX: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -; NVPTX: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0} -; NVPTX: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} -; NVPTX: [[META21]] = !{!"Simple C/C++ TBAA"} -; NVPTX: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]} -; NVPTX: [[META23]] = !{!"llvm.loop.mustprogress"} -; NVPTX: [[META24]] = !{!"llvm.loop.unroll.disable"} -; NVPTX: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]} -; NVPTX: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -; NVPTX: [[META27]] = !{!"any pointer", [[META20]], i64 0} -; NVPTX: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]} -; NVPTX: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]} +; NVPTX: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; NVPTX: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; NVPTX: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; NVPTX: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; NVPTX: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; NVPTX: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; NVPTX: [[META15]] = !{!"Simple C/C++ TBAA"} +; NVPTX: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; NVPTX: [[META17]] = !{!"llvm.loop.mustprogress"} +; NVPTX: [[META18]] = !{!"llvm.loop.unroll.disable"} +; NVPTX: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; NVPTX: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; NVPTX: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; NVPTX: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], 
[[META18]]} +; NVPTX: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} ;. ; AMDGPU-DISABLED1: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; AMDGPU-DISABLED1: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -4550,30 +3447,24 @@ attributes #11 = { convergent } ; AMDGPU-DISABLED1: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; AMDGPU-DISABLED1: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; AMDGPU-DISABLED1: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; AMDGPU-DISABLED1: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; AMDGPU-DISABLED1: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; AMDGPU-DISABLED1: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; AMDGPU-DISABLED1: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; AMDGPU-DISABLED1: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; AMDGPU-DISABLED1: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; AMDGPU-DISABLED1: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU-DISABLED1: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU-DISABLED1: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; AMDGPU-DISABLED1: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; AMDGPU-DISABLED1: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; AMDGPU-DISABLED1: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; AMDGPU-DISABLED1: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -; AMDGPU-DISABLED1: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0} -; AMDGPU-DISABLED1: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} -; AMDGPU-DISABLED1: [[META21]] = !{!"Simple C/C++ TBAA"} -; AMDGPU-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]} -; AMDGPU-DISABLED1: [[META23]] = !{!"llvm.loop.mustprogress"} -; AMDGPU-DISABLED1: [[META24]] = !{!"llvm.loop.unroll.disable"} -; AMDGPU-DISABLED1: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]} -; AMDGPU-DISABLED1: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -; AMDGPU-DISABLED1: [[META27]] = !{!"any pointer", [[META20]], i64 0} -; AMDGPU-DISABLED1: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]} -; AMDGPU-DISABLED1: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]} +; AMDGPU-DISABLED1: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU-DISABLED1: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU-DISABLED1: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU-DISABLED1: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; AMDGPU-DISABLED1: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; AMDGPU-DISABLED1: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; AMDGPU-DISABLED1: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; AMDGPU-DISABLED1: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; AMDGPU-DISABLED1: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; AMDGPU-DISABLED1: 
[[META15]] = !{!"Simple C/C++ TBAA"} +; AMDGPU-DISABLED1: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; AMDGPU-DISABLED1: [[META17]] = !{!"llvm.loop.mustprogress"} +; AMDGPU-DISABLED1: [[META18]] = !{!"llvm.loop.unroll.disable"} +; AMDGPU-DISABLED1: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; AMDGPU-DISABLED1: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; AMDGPU-DISABLED1: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; AMDGPU-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} +; AMDGPU-DISABLED1: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} ;. ; AMDGPU-DISABLED2: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; AMDGPU-DISABLED2: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -4581,30 +3472,24 @@ attributes #11 = { convergent } ; AMDGPU-DISABLED2: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; AMDGPU-DISABLED2: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; AMDGPU-DISABLED2: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; AMDGPU-DISABLED2: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; AMDGPU-DISABLED2: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; AMDGPU-DISABLED2: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; AMDGPU-DISABLED2: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; AMDGPU-DISABLED2: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; AMDGPU-DISABLED2: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; AMDGPU-DISABLED2: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU-DISABLED2: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU-DISABLED2: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; AMDGPU-DISABLED2: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; AMDGPU-DISABLED2: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; AMDGPU-DISABLED2: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; AMDGPU-DISABLED2: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -; AMDGPU-DISABLED2: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0} -; AMDGPU-DISABLED2: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} -; AMDGPU-DISABLED2: [[META21]] = !{!"Simple C/C++ TBAA"} -; AMDGPU-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]} -; AMDGPU-DISABLED2: [[META23]] = !{!"llvm.loop.mustprogress"} -; AMDGPU-DISABLED2: [[META24]] = !{!"llvm.loop.unroll.disable"} -; AMDGPU-DISABLED2: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]} -; AMDGPU-DISABLED2: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -; AMDGPU-DISABLED2: [[META27]] = !{!"any pointer", [[META20]], i64 0} -; AMDGPU-DISABLED2: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]} -; AMDGPU-DISABLED2: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]} +; AMDGPU-DISABLED2: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU-DISABLED2: [[META7:![0-9]+]] = !{i32 7, 
!"openmp", i32 50} +; AMDGPU-DISABLED2: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU-DISABLED2: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; AMDGPU-DISABLED2: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; AMDGPU-DISABLED2: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; AMDGPU-DISABLED2: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; AMDGPU-DISABLED2: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; AMDGPU-DISABLED2: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; AMDGPU-DISABLED2: [[META15]] = !{!"Simple C/C++ TBAA"} +; AMDGPU-DISABLED2: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; AMDGPU-DISABLED2: [[META17]] = !{!"llvm.loop.mustprogress"} +; AMDGPU-DISABLED2: [[META18]] = !{!"llvm.loop.unroll.disable"} +; AMDGPU-DISABLED2: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; AMDGPU-DISABLED2: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; AMDGPU-DISABLED2: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; AMDGPU-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} +; AMDGPU-DISABLED2: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} ;. ; NVPTX-DISABLED1: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; NVPTX-DISABLED1: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -4612,30 +3497,24 @@ attributes #11 = { convergent } ; NVPTX-DISABLED1: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; NVPTX-DISABLED1: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; NVPTX-DISABLED1: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; NVPTX-DISABLED1: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; NVPTX-DISABLED1: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; NVPTX-DISABLED1: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; NVPTX-DISABLED1: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; NVPTX-DISABLED1: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; NVPTX-DISABLED1: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; NVPTX-DISABLED1: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX-DISABLED1: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX-DISABLED1: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; NVPTX-DISABLED1: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; NVPTX-DISABLED1: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; NVPTX-DISABLED1: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; NVPTX-DISABLED1: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -; NVPTX-DISABLED1: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0} -; NVPTX-DISABLED1: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} -; NVPTX-DISABLED1: [[META21]] = !{!"Simple C/C++ TBAA"} -; NVPTX-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]} -; NVPTX-DISABLED1: [[META23]] = !{!"llvm.loop.mustprogress"} -; NVPTX-DISABLED1: [[META24]] = 
!{!"llvm.loop.unroll.disable"} -; NVPTX-DISABLED1: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]} -; NVPTX-DISABLED1: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -; NVPTX-DISABLED1: [[META27]] = !{!"any pointer", [[META20]], i64 0} -; NVPTX-DISABLED1: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]} -; NVPTX-DISABLED1: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]} +; NVPTX-DISABLED1: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX-DISABLED1: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX-DISABLED1: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX-DISABLED1: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; NVPTX-DISABLED1: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; NVPTX-DISABLED1: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; NVPTX-DISABLED1: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; NVPTX-DISABLED1: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; NVPTX-DISABLED1: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; NVPTX-DISABLED1: [[META15]] = !{!"Simple C/C++ TBAA"} +; NVPTX-DISABLED1: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; NVPTX-DISABLED1: [[META17]] = !{!"llvm.loop.mustprogress"} +; NVPTX-DISABLED1: [[META18]] = !{!"llvm.loop.unroll.disable"} +; NVPTX-DISABLED1: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; NVPTX-DISABLED1: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; NVPTX-DISABLED1: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; NVPTX-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} +; NVPTX-DISABLED1: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} ;. ; NVPTX-DISABLED2: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; NVPTX-DISABLED2: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -4643,28 +3522,22 @@ attributes #11 = { convergent } ; NVPTX-DISABLED2: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; NVPTX-DISABLED2: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; NVPTX-DISABLED2: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; NVPTX-DISABLED2: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; NVPTX-DISABLED2: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; NVPTX-DISABLED2: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; NVPTX-DISABLED2: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; NVPTX-DISABLED2: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; NVPTX-DISABLED2: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; NVPTX-DISABLED2: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX-DISABLED2: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX-DISABLED2: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; NVPTX-DISABLED2: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; NVPTX-DISABLED2: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; NVPTX-DISABLED2: 
[[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; NVPTX-DISABLED2: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -; NVPTX-DISABLED2: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0} -; NVPTX-DISABLED2: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} -; NVPTX-DISABLED2: [[META21]] = !{!"Simple C/C++ TBAA"} -; NVPTX-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]} -; NVPTX-DISABLED2: [[META23]] = !{!"llvm.loop.mustprogress"} -; NVPTX-DISABLED2: [[META24]] = !{!"llvm.loop.unroll.disable"} -; NVPTX-DISABLED2: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]} -; NVPTX-DISABLED2: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -; NVPTX-DISABLED2: [[META27]] = !{!"any pointer", [[META20]], i64 0} -; NVPTX-DISABLED2: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]} -; NVPTX-DISABLED2: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]} +; NVPTX-DISABLED2: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX-DISABLED2: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX-DISABLED2: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX-DISABLED2: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; NVPTX-DISABLED2: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; NVPTX-DISABLED2: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; NVPTX-DISABLED2: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; NVPTX-DISABLED2: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; NVPTX-DISABLED2: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; NVPTX-DISABLED2: [[META15]] = !{!"Simple C/C++ TBAA"} +; NVPTX-DISABLED2: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; NVPTX-DISABLED2: [[META17]] = !{!"llvm.loop.mustprogress"} +; NVPTX-DISABLED2: [[META18]] = !{!"llvm.loop.unroll.disable"} +; NVPTX-DISABLED2: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; NVPTX-DISABLED2: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; NVPTX-DISABLED2: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; NVPTX-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} +; NVPTX-DISABLED2: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} ;. diff --git a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll index 2f43a4e4286a2..99715cf5b4032 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll @@ -28,7 +28,7 @@ target triple = "nvptx64" ; CHECK: @__omp_offloading_fd02_404433c2_main_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 ;. 
-define weak void @__omp_offloading_fd02_404433c2_main_l5(ptr %dyn, ptr nonnull align 8 dereferenceable(8) %x) local_unnamed_addr #0 { +define weak ptx_kernel void @__omp_offloading_fd02_404433c2_main_l5(ptr %dyn, ptr nonnull align 8 dereferenceable(8) %x) local_unnamed_addr #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd02_404433c2_main_l5 ; CHECK-SAME: (ptr [[DYN:%.*]], ptr nonnull align 8 dereferenceable(8) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -47,7 +47,7 @@ define weak void @__omp_offloading_fd02_404433c2_main_l5(ptr %dyn, ptr nonnull a ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; CHECK: region.guarded: -; CHECK-NEXT: store double [[CALL_I]], ptr [[X]], align 8, !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: store double [[CALL_I]], ptr [[X]], align 8, !tbaa [[TBAA7:![0-9]+]] ; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]] ; CHECK: region.guarded.end: ; CHECK-NEXT: br label [[REGION_BARRIER]] @@ -127,12 +127,10 @@ attributes #5 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "st attributes #6 = { convergent nounwind "llvm.assume"="ompx_spmd_amenable" } !omp_offload.info = !{!0} -!nvvm.annotations = !{!1} !llvm.module.flags = !{!2, !3, !4, !5, !6} !llvm.ident = !{!7} !0 = !{i32 0, i32 64770, i32 1078211522, !"main", i32 5, i32 0} -!1 = !{ptr @__omp_offloading_fd02_404433c2_main_l5, !"kernel", i32 1} !2 = !{i32 1, !"wchar_size", i32 4} !3 = !{i32 7, !"openmp", i32 50} !4 = !{i32 7, !"openmp-device", i32 50} @@ -154,15 +152,14 @@ attributes #6 = { convergent nounwind "llvm.assume"="ompx_spmd_amenable" } ; CHECK: attributes #[[ATTR7]] = { convergent nounwind "llvm.assume"="ompx_spmd_amenable" } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 1078211522, !"main", i32 5, i32 0} -; CHECK: [[META1:![0-9]+]] = !{ptr @__omp_offloading_fd02_404433c2_main_l5, !"kernel", i32 1} -; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; CHECK: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} -; CHECK: [[META9]] = !{!"double", [[META10:![0-9]+]], i64 0} -; CHECK: [[META10]] = !{!"omnipotent char", [[META11:![0-9]+]], i64 0} -; CHECK: [[META11]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +; CHECK: [[META8]] = !{!"double", [[META9:![0-9]+]], i64 0} +; CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0} +; CHECK: [[META10]] = !{!"Simple C/C++ TBAA"} ;. 
diff --git a/llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll b/llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll index 75e01f3295fe2..953ecb2ddd8a6 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll @@ -297,12 +297,10 @@ attributes #14 = { convergent nounwind "llvm.assume"="ompx_aligned_barrier,ompx_ attributes #15 = { convergent nounwind } !omp_offload.info = !{!0} -!nvvm.annotations = !{!1} !llvm.module.flags = !{!2, !3, !4, !5} !llvm.ident = !{!6} !0 = !{i32 0, i32 32, i32 18757968, !"main", i32 12, i32 0} -!1 = !{ptr @__omp_offloading_20_11e3950_main_l12, !"kernel", i32 1} !2 = !{i32 1, !"wchar_size", i32 4} !3 = !{i32 7, !"openmp", i32 50} !4 = !{i32 7, !"openmp-device", i32 50} diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll index 229a49d784559..bbf1de253de92 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll @@ -58,7 +58,7 @@ target triple = "nvptx64" ; CHECK-DISABLED: @__omp_offloading_2a_fbfa7a_sequential_loop_l6_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ; CHECK-DISABLED: @__omp_outlined__1_wrapper.ID = private constant i8 undef ;. -define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x, i64 %N) #0 { +define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x, i64 %N) #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2a_fbfa7a_sequential_loop_l6 ; CHECK-SAME: (ptr [[DYN:%.*]], ptr [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -84,9 +84,9 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; CHECK: region.guarded: -; CHECK-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META8:![0-9]+]] -; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META8]] -; CHECK-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META8]] +; CHECK-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]] +; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META7]] +; CHECK-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]] ; CHECK: region.guarded.end: ; CHECK-NEXT: br label [[REGION_BARRIER]] @@ -111,7 +111,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[TMP5]], label [[REGION_GUARDED4:%.*]], label [[REGION_BARRIER2:%.*]] ; CHECK: region.guarded4: -; CHECK-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META8]] +; CHECK-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END1:%.*]] ; CHECK: region.guarded.end1: ; CHECK-NEXT: br label [[REGION_BARRIER2]] @@ -120,10 +120,10 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: br label [[REGION_EXIT3]] ; CHECK: region.exit3: ; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -; 
CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: __omp_outlined__.exit: ; CHECK-NEXT: call void @__kmpc_parallel_51(ptr null, i32 0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr null, i64 0) -; CHECK-NEXT: [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META8]] +; CHECK-NEXT: [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META7]] ; CHECK-NEXT: [[IDXPROM6_I:%.*]] = sext i32 [[CALL_I]] to i64 ; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID10:%.*]] @@ -132,7 +132,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 ; CHECK-NEXT: br i1 [[TMP7]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]] ; CHECK: region.guarded9: -; CHECK-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META8]] +; CHECK-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END6:%.*]] ; CHECK: region.guarded.end6: ; CHECK-NEXT: br label [[REGION_BARRIER7]] @@ -140,7 +140,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP6]]) ; CHECK-NEXT: br label [[REGION_EXIT8:%.*]] ; CHECK: region.exit8: -; CHECK-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] +; CHECK-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-NEXT: [[IDXPROM9_I:%.*]] = sext i32 [[CALL8_I]] to i64 ; CHECK-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID15:%.*]] @@ -149,7 +149,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 ; CHECK-NEXT: br i1 [[TMP9]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]] ; CHECK: region.guarded14: -; CHECK-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META8]] +; CHECK-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END11:%.*]] ; CHECK: region.guarded.end11: ; CHECK-NEXT: br label [[REGION_BARRIER12]] @@ -157,7 +157,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP8]]) ; CHECK-NEXT: br label [[REGION_EXIT13:%.*]] ; CHECK: region.exit13: -; CHECK-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] +; CHECK-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-NEXT: [[IDXPROM12_I:%.*]] = sext i32 [[CALL11_I]] to i64 ; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID20:%.*]] @@ -166,7 +166,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0 ; CHECK-NEXT: br i1 [[TMP11]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]] ; CHECK: region.guarded19: -; CHECK-NEXT: store i32 [[CALL11_I]], ptr 
[[ARRAYIDX13_I]], align 4, !noalias [[META8]] +; CHECK-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END16:%.*]] ; CHECK: region.guarded.end16: ; CHECK-NEXT: br label [[REGION_BARRIER17]] @@ -174,9 +174,9 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP10]]) ; CHECK-NEXT: br label [[REGION_EXIT18:%.*]] ; CHECK: region.exit18: -; CHECK-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] -; CHECK-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] -; CHECK-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] +; CHECK-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] +; CHECK-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] +; CHECK-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-NEXT: call void @__kmpc_target_deinit() #[[ATTR6]] ; CHECK-NEXT: ret void ; CHECK: worker.exit: @@ -230,13 +230,13 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-DISABLED-NEXT: [[SELECT:%.*]] = select i1 [[C]], ptr [[AL32]], ptr addrspacecast (ptr addrspace(5) @LocGlob to ptr) ; CHECK-DISABLED-NEXT: store ptr [[SELECT]], ptr [[LOC]], align 8 ; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] -; CHECK-DISABLED-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META8:![0-9]+]] +; CHECK-DISABLED-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]] ; CHECK-DISABLED-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1 -; CHECK-DISABLED-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META8]] +; CHECK-DISABLED-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[SEXT:%.*]] = shl i64 [[N]], 32 ; CHECK-DISABLED-NEXT: [[IDXPROM_I:%.*]] = ashr exact i64 [[SEXT]], 32 ; CHECK-DISABLED-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM_I]] -; CHECK-DISABLED-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META8]] +; CHECK-DISABLED-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: call void @usei8ptr(ptr nocapture [[HEAP2STACK_H2S]]) #[[ATTR9:[0-9]+]] ; CHECK-DISABLED-NEXT: br label [[FOR_COND_I:%.*]] ; CHECK-DISABLED: for.cond.i: @@ -248,26 +248,26 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-DISABLED-NEXT: [[SUB3_I:%.*]] = add nsw i32 [[I_0_I]], -1 ; CHECK-DISABLED-NEXT: [[IDXPROM4_I:%.*]] = zext i32 [[I_0_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]] -; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META8]] +; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -; CHECK-DISABLED-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-DISABLED-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-DISABLED: __omp_outlined__.exit: ; CHECK-DISABLED-NEXT: call void 
@__kmpc_parallel_51(ptr null, i32 0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr null, i64 0) -; CHECK-DISABLED-NEXT: [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META8]] +; CHECK-DISABLED-NEXT: [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[IDXPROM6_I:%.*]] = sext i32 [[CALL_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]] -; CHECK-DISABLED-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META8]] -; CHECK-DISABLED-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] +; CHECK-DISABLED-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[IDXPROM9_I:%.*]] = sext i32 [[CALL8_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]] -; CHECK-DISABLED-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META8]] -; CHECK-DISABLED-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] +; CHECK-DISABLED-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[IDXPROM12_I:%.*]] = sext i32 [[CALL11_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]] -; CHECK-DISABLED-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META8]] -; CHECK-DISABLED-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] -; CHECK-DISABLED-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] -; CHECK-DISABLED-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] +; CHECK-DISABLED-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit() #[[ATTR6]] ; CHECK-DISABLED-NEXT: ret void ; CHECK-DISABLED: worker.exit: @@ -404,12 +404,10 @@ attributes #4 = { inaccessiblememonly nofree nosync nounwind willreturn } attributes #5 = { convergent nounwind "llvm.assume"="omp_no_openmp,ompx_spmd_amenable" } !omp_offload.info = !{!0} -!nvvm.annotations = !{!1} !llvm.module.flags = !{!2, !3, !4, !5, !6} !llvm.ident = !{!7} !0 = !{i32 0, i32 42, i32 16513658, !"sequential_loop", i32 6, i32 0} -!1 = !{ptr @__omp_offloading_2a_fbfa7a_sequential_loop_l6, !"kernel", i32 1} !2 = !{i32 1, !"wchar_size", i32 4} !3 = !{i32 7, !"openmp", i32 50} !4 = !{i32 7, !"openmp-device", i32 50} @@ -447,30 +445,28 @@ attributes #5 = { convergent nounwind "llvm.assume"="omp_no_openmp,ompx_spmd_ame ; CHECK-DISABLED: attributes #[[ATTR10]] = { convergent nounwind "llvm.assume"="omp_no_openmp,ompx_spmd_amenable" } ;. 
; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 42, i32 16513658, !"sequential_loop", i32 6, i32 0} -; CHECK: [[META1:![0-9]+]] = !{ptr @__omp_offloading_2a_fbfa7a_sequential_loop_l6, !"kernel", i32 1} -; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; CHECK: [[META8]] = !{[[META9:![0-9]+]]} -; CHECK: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"__omp_outlined__: %__context"} -; CHECK: [[META10]] = distinct !{[[META10]], !"__omp_outlined__"} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META12:![0-9]+]]} -; CHECK: [[META12]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK: [[META7]] = !{[[META8:![0-9]+]]} +; CHECK: [[META8]] = distinct !{[[META8]], [[META9:![0-9]+]], !"__omp_outlined__: %__context"} +; CHECK: [[META9]] = distinct !{[[META9]], !"__omp_outlined__"} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META11:![0-9]+]]} +; CHECK: [[META11]] = !{!"llvm.loop.mustprogress"} ;. ; CHECK-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 42, i32 16513658, !"sequential_loop", i32 6, i32 0} -; CHECK-DISABLED: [[META1:![0-9]+]] = !{ptr @__omp_offloading_2a_fbfa7a_sequential_loop_l6, !"kernel", i32 1} -; CHECK-DISABLED: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK-DISABLED: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK-DISABLED: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK-DISABLED: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK-DISABLED: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK-DISABLED: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; CHECK-DISABLED: [[META8]] = !{[[META9:![0-9]+]]} -; CHECK-DISABLED: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"__omp_outlined__: %__context"} -; CHECK-DISABLED: [[META10]] = distinct !{[[META10]], !"__omp_outlined__"} -; CHECK-DISABLED: [[LOOP11]] = distinct !{[[LOOP11]], [[META12:![0-9]+]]} -; CHECK-DISABLED: [[META12]] = !{!"llvm.loop.mustprogress"} +; CHECK-DISABLED: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK-DISABLED: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK-DISABLED: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK-DISABLED: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK-DISABLED: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK-DISABLED: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK-DISABLED: [[META7]] = !{[[META8:![0-9]+]]} +; CHECK-DISABLED: [[META8]] = distinct !{[[META8]], [[META9:![0-9]+]], !"__omp_outlined__: %__context"} +; CHECK-DISABLED: [[META9]] = distinct !{[[META9]], !"__omp_outlined__"} +; CHECK-DISABLED: [[LOOP10]] = distinct !{[[LOOP10]], [[META11:![0-9]+]]} +; CHECK-DISABLED: [[META11]] = !{!"llvm.loop.mustprogress"} ;. 
diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll index 11405b7eb447c..a644fe1b2f821 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll @@ -56,7 +56,7 @@ target triple = "nvptx64" ; CHECK-DISABLE-SPMDIZATION: @__omp_offloading_2b_10393b5_spmd_l12_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ; CHECK-DISABLE-SPMDIZATION: @__omp_offloading_2b_10393b5_generic_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ;. -define weak void @__omp_offloading_2b_10393b5_spmd_l12(ptr %dyn) "kernel" #0 { +define weak ptx_kernel void @__omp_offloading_2b_10393b5_spmd_l12(ptr %dyn) "kernel" #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2b_10393b5_spmd_l12 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -113,7 +113,7 @@ define weak i32 @__kmpc_target_init(ptr, ptr) { declare void @__kmpc_target_deinit() ; Function Attrs: convergent noinline norecurse nounwind -define weak void @__omp_offloading_2b_10393b5_generic_l20(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_2b_10393b5_generic_l20(ptr %dyn) #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2b_10393b5_generic_l20 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: @@ -321,14 +321,12 @@ attributes #4 = { alwaysinline } attributes #5 = { convergent } !omp_offload.info = !{!0, !1} -!nvvm.annotations = !{!2, !3} !llvm.module.flags = !{!4, !5, !6, !7, !8} !llvm.ident = !{!9} !0 = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} !1 = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} -!2 = !{ptr @__omp_offloading_2b_10393b5_spmd_l12, !"kernel", i32 1} -!3 = !{ptr @__omp_offloading_2b_10393b5_generic_l20, !"kernel", i32 1} + !4 = !{i32 1, !"wchar_size", i32 4} !5 = !{i32 7, !"openmp", i32 50} !6 = !{i32 7, !"openmp-device", i32 50} @@ -358,23 +356,19 @@ attributes #5 = { convergent } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} ; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} -; CHECK: [[META2:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_spmd_l12, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_generic_l20, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META7:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META8:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK: [[META9:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} ;. 
; CHECK-DISABLE-SPMDIZATION: [[META0:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} ; CHECK-DISABLE-SPMDIZATION: [[META1:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} -; CHECK-DISABLE-SPMDIZATION: [[META2:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_spmd_l12, !"kernel", i32 1} -; CHECK-DISABLE-SPMDIZATION: [[META3:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_generic_l20, !"kernel", i32 1} -; CHECK-DISABLE-SPMDIZATION: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK-DISABLE-SPMDIZATION: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK-DISABLE-SPMDIZATION: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK-DISABLE-SPMDIZATION: [[META7:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK-DISABLE-SPMDIZATION: [[META8:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK-DISABLE-SPMDIZATION: [[META9:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK-DISABLE-SPMDIZATION: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK-DISABLE-SPMDIZATION: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK-DISABLE-SPMDIZATION: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK-DISABLE-SPMDIZATION: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK-DISABLE-SPMDIZATION: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK-DISABLE-SPMDIZATION: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} ;. diff --git a/llvm/test/Transforms/OpenMP/spmdization_indirect.ll b/llvm/test/Transforms/OpenMP/spmdization_indirect.ll index f348825446c63..6dfc14e9270ed 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_indirect.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_indirect.ll @@ -28,7 +28,7 @@ ; NVPTX: @spmd_and_non_spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ; NVPTX: @spmd_and_non_spmd_callee_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ;. 
-define weak void @spmd_callees(i1 %c) #0 { +define weak ptx_kernel void @spmd_callees(i1 %c) #0 { ; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees ; AMDGPU-SAME: (i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] { ; AMDGPU-NEXT: call void @spmd_callees__debug(i1 [[C]]) @@ -57,7 +57,7 @@ define internal void @spmd_callees__debug(i1 %c) { ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10:[0-9]+]] ; AMDGPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17:![0-9]+]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; AMDGPU-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2 ; AMDGPU-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2 ; AMDGPU-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] @@ -88,7 +88,7 @@ define internal void @spmd_callees__debug(i1 %c) { ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10:[0-9]+]] ; NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17:![0-9]+]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; NVPTX-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2 ; NVPTX-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2 ; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] @@ -143,10 +143,10 @@ define internal void @__omp_outlined_spmd_amenable1(ptr noalias %.global_tid., p ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP21:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable1 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { @@ -161,10 +161,10 @@ define internal void @__omp_outlined_spmd_amenable1(ptr noalias %.global_tid., p ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP21:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; entry: %captured_vars_addrs = alloca [0 x ptr], align 8 @@ -262,10 +262,10 @@ define internal void @__omp_outlined_spmd_amenable2(ptr noalias %.global_tid., p ; AMDGPU-NEXT: call void @spmd_amenable() 
#[[ATTR6]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable2 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { @@ -282,10 +282,10 @@ define internal void @__omp_outlined_spmd_amenable2(ptr noalias %.global_tid., p ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; entry: %captured_vars_addrs = alloca [0 x ptr], align 8 @@ -367,7 +367,7 @@ entry: ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @spmd_and_non_spmd_callee(i1 %c) #0 { +define weak ptx_kernel void @spmd_and_non_spmd_callee(i1 %c) #0 { ; ; ; AMDGPU-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callee @@ -413,7 +413,7 @@ define weak void @spmd_and_non_spmd_callee(i1 %c) #0 { ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; AMDGPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable ; AMDGPU-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable ; AMDGPU-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] @@ -473,7 +473,7 @@ define weak void @spmd_and_non_spmd_callee(i1 %c) #0 { ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable ; NVPTX-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable ; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] @@ -530,11 +530,11 @@ define internal void @__omp_outlined_spmd_amenable3(ptr noalias %.global_tid., p ; AMDGPU-NEXT: call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR10]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa 
[[TBAA25:![0-9]+]] -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP27:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable3 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { @@ -551,11 +551,11 @@ define internal void @__omp_outlined_spmd_amenable3(ptr noalias %.global_tid., p ; NVPTX-NEXT: call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR10]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA25:![0-9]+]] -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP27:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; entry: %captured_vars_addrs = alloca [1 x ptr], align 8 @@ -587,18 +587,18 @@ define internal void @__omp_outlined__5(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5 ; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @unknown() #[[ATTR7]] ; NVPTX-NEXT: ret void ; @@ -622,7 +622,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, 
ptr [[TMP2]], align 8, !tbaa [[TBAA25]] +; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR10]] ; AMDGPU-NEXT: ret void ; @@ -634,7 +634,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA25]] +; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR10]] ; NVPTX-NEXT: ret void ; @@ -652,7 +652,7 @@ entry: } ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @spmd_callees_metadata(ptr %fp) #0 { +define weak ptx_kernel void @spmd_callees_metadata(ptr %fp) #0 { ; ; ; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees_metadata @@ -668,7 +668,7 @@ define weak void @spmd_callees_metadata(ptr %fp) #0 { ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; AMDGPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -686,7 +686,7 @@ define weak void @spmd_callees_metadata(ptr %fp) #0 { ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -711,7 +711,7 @@ user_code.entry: ; preds = %entry } ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 { +define weak ptx_kernel void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 { ; ; ; AMDGPU-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callees_metadata @@ -757,7 +757,7 @@ define weak void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 { ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; AMDGPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external ; AMDGPU-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] ; AMDGPU: 3: @@ -816,7 +816,7 @@ define weak void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 { ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; NVPTX-NEXT: store i32 0, 
ptr [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external ; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] ; NVPTX: 3: @@ -868,10 +868,10 @@ define void @__omp_outlined_spmd_amenable_external(ptr noalias %.global_tid., pt ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr undef, i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable_external ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { @@ -885,10 +885,10 @@ define void @__omp_outlined_spmd_amenable_external(ptr noalias %.global_tid., pt ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr undef, i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; entry: br label %for.cond @@ -1069,7 +1069,6 @@ attributes #10 = { convergent "llvm.assume"="ompx_spmd_amenable" } attributes #11 = { convergent } !omp_offload.info = !{!0, !1, !2, !3, !4, !5} -!nvvm.annotations = !{!6, !7, !8, !9, !10, !11} !llvm.module.flags = !{!12, !13, !14, !15, !16} !llvm.ident = !{!17} @@ -1079,12 +1078,6 @@ attributes #11 = { convergent } !3 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} !4 = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} !5 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -!6 = !{ptr @spmd_callees, !"kernel", i32 1} -!7 = !{ptr @spmd_and_non_spmd_callees_metadata, !"kernel", i32 1} -!8 = !{ptr @spmd_and_non_spmd_callee, !"kernel", i32 1} -!9 = !{ptr @spmd_callees_metadata, !"kernel", i32 1} -!10 = !{i32 1} -!11 = !{i32 1} !12 = !{i32 1, !"wchar_size", i32 4} !13 = !{i32 7, !"openmp", i32 50} !14 = !{i32 7, !"openmp-device", i32 50} @@ -1139,29 +1132,24 @@ attributes #11 = { convergent } ; AMDGPU: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; AMDGPU: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; AMDGPU: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; AMDGPU: [[META6:![0-9]+]] = !{ptr @spmd_callees, !"kernel", i32 1} -; AMDGPU: [[META7:![0-9]+]] = !{ptr @spmd_and_non_spmd_callees_metadata, 
!"kernel", i32 1} -; AMDGPU: [[META8:![0-9]+]] = !{ptr @spmd_and_non_spmd_callee, !"kernel", i32 1} -; AMDGPU: [[META9:![0-9]+]] = !{ptr @spmd_callees_metadata, !"kernel", i32 1} -; AMDGPU: [[META10:![0-9]+]] = !{i32 1} -; AMDGPU: [[META11:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU: [[META12:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU: [[META13:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; AMDGPU: [[META14:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; AMDGPU: [[META15:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; AMDGPU: [[META16:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; AMDGPU: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} -; AMDGPU: [[META18]] = !{!"int", [[META19:![0-9]+]], i64 0} -; AMDGPU: [[META19]] = !{!"omnipotent char", [[META20:![0-9]+]], i64 0} -; AMDGPU: [[META20]] = !{!"Simple C/C++ TBAA"} -; AMDGPU: [[LOOP21]] = distinct !{[[LOOP21]], [[META22:![0-9]+]], [[META23:![0-9]+]]} -; AMDGPU: [[META22]] = !{!"llvm.loop.mustprogress"} -; AMDGPU: [[META23]] = !{!"llvm.loop.unroll.disable"} -; AMDGPU: [[LOOP24]] = distinct !{[[LOOP24]], [[META22]], [[META23]]} -; AMDGPU: [[TBAA25]] = !{[[META26:![0-9]+]], [[META26]], i64 0} -; AMDGPU: [[META26]] = !{!"any pointer", [[META19]], i64 0} -; AMDGPU: [[LOOP27]] = distinct !{[[LOOP27]], [[META22]], [[META23]]} -; AMDGPU: [[LOOP28]] = distinct !{[[LOOP28]], [[META22]], [[META23]]} +; AMDGPU: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; AMDGPU: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; AMDGPU: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; AMDGPU: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; AMDGPU: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; AMDGPU: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; AMDGPU: [[META15]] = !{!"Simple C/C++ TBAA"} +; AMDGPU: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; AMDGPU: [[META17]] = !{!"llvm.loop.mustprogress"} +; AMDGPU: [[META18]] = !{!"llvm.loop.unroll.disable"} +; AMDGPU: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; AMDGPU: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; AMDGPU: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; AMDGPU: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} +; AMDGPU: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} ;. 
; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
@@ -1169,27 +1157,22 @@ attributes #11 = { convergent }
; NVPTX: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; NVPTX: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; NVPTX: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
-; NVPTX: [[META6:![0-9]+]] = !{ptr @spmd_callees, !"kernel", i32 1}
-; NVPTX: [[META7:![0-9]+]] = !{ptr @spmd_and_non_spmd_callees_metadata, !"kernel", i32 1}
-; NVPTX: [[META8:![0-9]+]] = !{ptr @spmd_and_non_spmd_callee, !"kernel", i32 1}
-; NVPTX: [[META9:![0-9]+]] = !{ptr @spmd_callees_metadata, !"kernel", i32 1}
-; NVPTX: [[META10:![0-9]+]] = !{i32 1}
-; NVPTX: [[META11:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
-; NVPTX: [[META12:![0-9]+]] = !{i32 7, !"openmp", i32 50}
-; NVPTX: [[META13:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
-; NVPTX: [[META14:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
-; NVPTX: [[META15:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; NVPTX: [[META16:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-; NVPTX: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0}
-; NVPTX: [[META18]] = !{!"int", [[META19:![0-9]+]], i64 0}
-; NVPTX: [[META19]] = !{!"omnipotent char", [[META20:![0-9]+]], i64 0}
-; NVPTX: [[META20]] = !{!"Simple C/C++ TBAA"}
-; NVPTX: [[LOOP21]] = distinct !{[[LOOP21]], [[META22:![0-9]+]], [[META23:![0-9]+]]}
-; NVPTX: [[META22]] = !{!"llvm.loop.mustprogress"}
-; NVPTX: [[META23]] = !{!"llvm.loop.unroll.disable"}
-; NVPTX: [[LOOP24]] = distinct !{[[LOOP24]], [[META22]], [[META23]]}
-; NVPTX: [[TBAA25]] = !{[[META26:![0-9]+]], [[META26]], i64 0}
-; NVPTX: [[META26]] = !{!"any pointer", [[META19]], i64 0}
-; NVPTX: [[LOOP27]] = distinct !{[[LOOP27]], [[META22]], [[META23]]}
-; NVPTX: [[LOOP28]] = distinct !{[[LOOP28]], [[META22]], [[META23]]}
+; NVPTX: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; NVPTX: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; NVPTX: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; NVPTX: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
+; NVPTX: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
+; NVPTX: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; NVPTX: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
+; NVPTX: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0}
+; NVPTX: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0}
+; NVPTX: [[META15]] = !{!"Simple C/C++ TBAA"}
+; NVPTX: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]}
+; NVPTX: [[META17]] = !{!"llvm.loop.mustprogress"}
+; NVPTX: [[META18]] = !{!"llvm.loop.unroll.disable"}
+; NVPTX: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]}
+; NVPTX: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0}
+; NVPTX: [[META21]] = !{!"any pointer", [[META14]], i64 0}
+; NVPTX: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]}
+; NVPTX: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]}
+;.
diff --git a/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll index f28f61e053275..1cfce147ac81e 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll @@ -57,7 +57,7 @@ target triple = "nvptx64" ; CHECK-DISABLE-SPMDIZATION: @__omp_offloading_2b_10393b5_generic_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ; CHECK-DISABLE-SPMDIZATION: @__omp_outlined___wrapper.ID = private constant i8 undef ;. -define weak void @__omp_offloading_2b_10393b5_spmd_l12(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_2b_10393b5_spmd_l12(ptr %dyn) #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2b_10393b5_spmd_l12 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -147,7 +147,7 @@ define weak i32 @__kmpc_target_init(ptr, ptr) { declare void @__kmpc_target_deinit() ; Function Attrs: convergent noinline norecurse nounwind -define weak void @__omp_offloading_2b_10393b5_generic_l20(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_2b_10393b5_generic_l20(ptr %dyn) #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2b_10393b5_generic_l20 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: @@ -397,14 +397,11 @@ attributes #4 = { alwaysinline } attributes #5 = { convergent } !omp_offload.info = !{!0, !1} -!nvvm.annotations = !{!2, !3} !llvm.module.flags = !{!4, !5, !6, !7, !8} !llvm.ident = !{!9} !0 = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} !1 = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} -!2 = !{ptr @__omp_offloading_2b_10393b5_spmd_l12, !"kernel", i32 1} -!3 = !{ptr @__omp_offloading_2b_10393b5_generic_l20, !"kernel", i32 1} !4 = !{i32 1, !"wchar_size", i32 4} !5 = !{i32 7, !"openmp", i32 50} !6 = !{i32 7, !"openmp-device", i32 50} @@ -434,23 +431,19 @@ attributes #5 = { convergent } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} ; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} -; CHECK: [[META2:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_spmd_l12, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_generic_l20, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META7:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META8:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK: [[META9:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} ;. 
; CHECK-DISABLE-SPMDIZATION: [[META0:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} ; CHECK-DISABLE-SPMDIZATION: [[META1:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} -; CHECK-DISABLE-SPMDIZATION: [[META2:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_spmd_l12, !"kernel", i32 1} -; CHECK-DISABLE-SPMDIZATION: [[META3:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_generic_l20, !"kernel", i32 1} -; CHECK-DISABLE-SPMDIZATION: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK-DISABLE-SPMDIZATION: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK-DISABLE-SPMDIZATION: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK-DISABLE-SPMDIZATION: [[META7:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK-DISABLE-SPMDIZATION: [[META8:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK-DISABLE-SPMDIZATION: [[META9:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK-DISABLE-SPMDIZATION: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK-DISABLE-SPMDIZATION: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK-DISABLE-SPMDIZATION: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK-DISABLE-SPMDIZATION: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK-DISABLE-SPMDIZATION: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK-DISABLE-SPMDIZATION: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} ;. diff --git a/llvm/test/Transforms/OpenMP/spmdization_remarks.ll b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll index f5a4cea9a841c..ef36937bc5734 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_remarks.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll @@ -62,7 +62,7 @@ target triple = "nvptx64" ; Function Attrs: convergent norecurse nounwind -define weak void @__omp_offloading_2a_d80d3d_test_fallback_l11(ptr %dyn) local_unnamed_addr #0 !dbg !15 { +define weak ptx_kernel void @__omp_offloading_2a_d80d3d_test_fallback_l11(ptr %dyn) local_unnamed_addr #0 !dbg !15 { entry: %captured_vars_addrs.i.i = alloca [0 x ptr], align 8 %0 = call i32 @__kmpc_target_init(ptr nonnull @__omp_offloading_2a_d80d3d_test_fallback_l11_kernel_environment, ptr %dyn) #3, !dbg !18 @@ -107,7 +107,7 @@ declare i32 @__kmpc_global_thread_num(ptr) local_unnamed_addr #3 declare void @__kmpc_target_deinit() local_unnamed_addr ; Function Attrs: norecurse nounwind -define weak void @__omp_offloading_2a_d80d3d_test_no_fallback_l20(ptr %dyn) local_unnamed_addr #4 !dbg !32 { +define weak ptx_kernel void @__omp_offloading_2a_d80d3d_test_no_fallback_l20(ptr %dyn) local_unnamed_addr #4 !dbg !32 { entry: %captured_vars_addrs.i2.i = alloca [0 x ptr], align 8 %0 = call i32 @__kmpc_target_init(ptr nonnull @__omp_offloading_2a_d80d3d_test_no_fallback_l20_kernel_environment, ptr %dyn) #3, !dbg !33 @@ -175,7 +175,6 @@ attributes #7 = { "llvm.assume"="ompx_spmd_amenable" } !llvm.dbg.cu = !{!0} !omp_offload.info = !{!3, !4} -!nvvm.annotations = !{!5, !6} !llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13} !llvm.ident = !{!14} @@ -184,8 +183,6 @@ attributes #7 = { "llvm.assume"="ompx_spmd_amenable" } !2 = !{} !3 = !{i32 0, i32 42, i32 14159165, !"test_no_fallback", i32 20, i32 1} !4 = !{i32 0, i32 42, i32 14159165, !"test_fallback", i32 11, i32 0} -!5 = !{ptr @__omp_offloading_2a_d80d3d_test_fallback_l11, !"kernel", i32 1} -!6 = !{ptr @__omp_offloading_2a_d80d3d_test_no_fallback_l20, !"kernel", i32 1} !7 = !{i32 7, !"Dwarf Version", i32 2} !8 = !{i32 2, !"Debug Info Version", i32 3} !9 = !{i32 1, !"wchar_size", i32 4} 
diff --git a/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll b/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll index 5e2abbae1811c..2842dfd030b11 100644 --- a/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll +++ b/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll @@ -53,7 +53,7 @@ target triple = "amdgcn-amd-amdhsa" ; CHECK: @str = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1 ; CHECK: @kernel_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null } ;. -define void @kernel(ptr %dyn) "kernel" { +define amdgpu_kernel void @kernel(ptr %dyn) "kernel" { ; ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel @@ -144,7 +144,7 @@ define void @test_assume() { } ; We can't ignore the sync, hence this might store 2 into %p -define void @kernel2(ptr %p) "kernel" { +define amdgpu_kernel void @kernel2(ptr %p) "kernel" { ; CHECK-LABEL: define {{[^@]+}}@kernel2 ; CHECK-SAME: (ptr [[P:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: store i32 1, ptr addrspace(3) @X, align 4 @@ -163,7 +163,7 @@ define void @kernel2(ptr %p) "kernel" { } ; We can't ignore the sync, hence this might store 2 into %p -define void @kernel3(ptr %p) "kernel" { +define amdgpu_kernel void @kernel3(ptr %p) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel3 ; TUNIT-SAME: (ptr [[P:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: store i32 1, ptr addrspace(3) @X, align 4 @@ -199,7 +199,7 @@ define void @sync_def() { ret void } -define void @kernel4a1(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4a1(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel4a1 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: store i32 0, ptr addrspace(3) @QA1, align 4 @@ -242,7 +242,7 @@ S: } ; We should not replace the load or delete the second store. -define void @kernel4b1(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4b1(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel4b1 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: store i32 0, ptr addrspace(3) @QB1, align 4 @@ -281,7 +281,7 @@ S: ret void } -define void @kernel4a2(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4a2(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel4a2 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] @@ -317,7 +317,7 @@ S: } ; FIXME: We should not replace the load with undef. -define void @kernel4b2(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4b2(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel4b2 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] @@ -349,7 +349,7 @@ S: ret void } -define void @kernel4a3(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4a3(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel4a3 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: store i32 0, ptr addrspace(3) @QA3, align 4 @@ -401,7 +401,7 @@ S: } ; The load of QB3 should not be simplified to 0. 
-define void @kernel4b3(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4b3(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel4b3 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: store i32 0, ptr addrspace(3) @QB3, align 4 @@ -453,7 +453,7 @@ S: } -define void @kernel4c1(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4c1(i1 %c) "kernel" { ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel4c1 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { @@ -488,7 +488,7 @@ S: } ; We should not replace the load or delete the second store. -define void @kernel4d1(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4d1(i1 %c) "kernel" { ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel4d1 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { @@ -529,7 +529,7 @@ S: ret void } -define void @kernel4c2(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4c2(i1 %c) "kernel" { ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel4c2 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { @@ -563,7 +563,7 @@ S: } ; We should not replace the load with undef. -define void @kernel4d2(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4d2(i1 %c) "kernel" { ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel4d2 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { @@ -595,7 +595,7 @@ S: ret void } -define void @kernel4c3(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4c3(i1 %c) "kernel" { ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel4c3 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { @@ -629,7 +629,7 @@ S: } ; We should not replace the load with undef. -define void @kernel4d3(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4d3(i1 %c) "kernel" { ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel4d3 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { @@ -661,7 +661,7 @@ S: ret void } -define void @kernel_unknown_and_aligned1(i1 %c) "kernel" { +define amdgpu_kernel void @kernel_unknown_and_aligned1(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel_unknown_and_aligned1 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] @@ -700,7 +700,7 @@ S: ret void } -define void @kernel_unknown_and_aligned2(i1 %c) "kernel" { +define amdgpu_kernel void @kernel_unknown_and_aligned2(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel_unknown_and_aligned2 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] @@ -741,7 +741,7 @@ S: ret void } -define void @kernel_unknown_and_aligned3(i1 %c) "kernel" { +define amdgpu_kernel void @kernel_unknown_and_aligned3(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel_unknown_and_aligned3 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] @@ -782,7 +782,7 @@ S: ret void } -define void @kernel_unknown_and_not_aligned1(i1 %c) "kernel" { +define amdgpu_kernel void @kernel_unknown_and_not_aligned1(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel_unknown_and_not_aligned1 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] @@ -828,29 +828,9 @@ declare void @__kmpc_target_deinit() nocallback declare void @llvm.assume(i1) !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20} !0 = !{i32 7, !"openmp", i32 50} !1 = !{i32 7, !"openmp-device", i32 50} -!2 = !{ptr @kernel, !"kernel", i32 1} -!3 = 
!{ptr @kernel2, !"kernel", i32 1} -!4 = !{ptr @kernel3, !"kernel", i32 1} -!5 = !{ptr @kernel4a1, !"kernel", i32 1} -!6 = !{ptr @kernel4b1, !"kernel", i32 1} -!7 = !{ptr @kernel4a2, !"kernel", i32 1} -!8 = !{ptr @kernel4b2, !"kernel", i32 1} -!9 = !{ptr @kernel4a3, !"kernel", i32 1} -!10 = !{ptr @kernel4b3, !"kernel", i32 1} -!11 = !{ptr @kernel4c1, !"kernel", i32 1} -!12 = !{ptr @kernel4d1, !"kernel", i32 1} -!13 = !{ptr @kernel4c2, !"kernel", i32 1} -!14 = !{ptr @kernel4d2, !"kernel", i32 1} -!15 = !{ptr @kernel4c3, !"kernel", i32 1} -!16 = !{ptr @kernel4d3, !"kernel", i32 1} -!17 = !{ptr @kernel_unknown_and_aligned1, !"kernel", i32 1} -!18 = !{ptr @kernel_unknown_and_aligned2, !"kernel", i32 1} -!19 = !{ptr @kernel_unknown_and_aligned3, !"kernel", i32 1} -!20 = !{ptr @kernel_unknown_and_not_aligned1, !"kernel", i32 1} ;. ; TUNIT: attributes #[[ATTR0]] = { norecurse "kernel" } @@ -872,45 +852,7 @@ declare void @llvm.assume(i1) ;. ; TUNIT: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; TUNIT: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; TUNIT: [[META2:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1} -; TUNIT: [[META3:![0-9]+]] = !{ptr @kernel2, !"kernel", i32 1} -; TUNIT: [[META4:![0-9]+]] = !{ptr @kernel3, !"kernel", i32 1} -; TUNIT: [[META5:![0-9]+]] = !{ptr @kernel4a1, !"kernel", i32 1} -; TUNIT: [[META6:![0-9]+]] = !{ptr @kernel4b1, !"kernel", i32 1} -; TUNIT: [[META7:![0-9]+]] = !{ptr @kernel4a2, !"kernel", i32 1} -; TUNIT: [[META8:![0-9]+]] = !{ptr @kernel4b2, !"kernel", i32 1} -; TUNIT: [[META9:![0-9]+]] = !{ptr @kernel4a3, !"kernel", i32 1} -; TUNIT: [[META10:![0-9]+]] = !{ptr @kernel4b3, !"kernel", i32 1} -; TUNIT: [[META11:![0-9]+]] = !{ptr @kernel4c1, !"kernel", i32 1} -; TUNIT: [[META12:![0-9]+]] = !{ptr @kernel4d1, !"kernel", i32 1} -; TUNIT: [[META13:![0-9]+]] = !{ptr @kernel4c2, !"kernel", i32 1} -; TUNIT: [[META14:![0-9]+]] = !{ptr @kernel4d2, !"kernel", i32 1} -; TUNIT: [[META15:![0-9]+]] = !{ptr @kernel4c3, !"kernel", i32 1} -; TUNIT: [[META16:![0-9]+]] = !{ptr @kernel4d3, !"kernel", i32 1} -; TUNIT: [[META17:![0-9]+]] = !{ptr @kernel_unknown_and_aligned1, !"kernel", i32 1} -; TUNIT: [[META18:![0-9]+]] = !{ptr @kernel_unknown_and_aligned2, !"kernel", i32 1} -; TUNIT: [[META19:![0-9]+]] = !{ptr @kernel_unknown_and_aligned3, !"kernel", i32 1} -; TUNIT: [[META20:![0-9]+]] = !{ptr @kernel_unknown_and_not_aligned1, !"kernel", i32 1} ;. 
; CGSCC: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
 ; CGSCC: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
-; CGSCC: [[META2:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1}
-; CGSCC: [[META3:![0-9]+]] = !{ptr @kernel2, !"kernel", i32 1}
-; CGSCC: [[META4:![0-9]+]] = !{ptr @kernel3, !"kernel", i32 1}
-; CGSCC: [[META5:![0-9]+]] = !{ptr @kernel4a1, !"kernel", i32 1}
-; CGSCC: [[META6:![0-9]+]] = !{ptr @kernel4b1, !"kernel", i32 1}
-; CGSCC: [[META7:![0-9]+]] = !{ptr @kernel4a2, !"kernel", i32 1}
-; CGSCC: [[META8:![0-9]+]] = !{ptr @kernel4b2, !"kernel", i32 1}
-; CGSCC: [[META9:![0-9]+]] = !{ptr @kernel4a3, !"kernel", i32 1}
-; CGSCC: [[META10:![0-9]+]] = !{ptr @kernel4b3, !"kernel", i32 1}
-; CGSCC: [[META11:![0-9]+]] = !{ptr @kernel4c1, !"kernel", i32 1}
-; CGSCC: [[META12:![0-9]+]] = !{ptr @kernel4d1, !"kernel", i32 1}
-; CGSCC: [[META13:![0-9]+]] = !{ptr @kernel4c2, !"kernel", i32 1}
-; CGSCC: [[META14:![0-9]+]] = !{ptr @kernel4d2, !"kernel", i32 1}
-; CGSCC: [[META15:![0-9]+]] = !{ptr @kernel4c3, !"kernel", i32 1}
-; CGSCC: [[META16:![0-9]+]] = !{ptr @kernel4d3, !"kernel", i32 1}
-; CGSCC: [[META17:![0-9]+]] = !{ptr @kernel_unknown_and_aligned1, !"kernel", i32 1}
-; CGSCC: [[META18:![0-9]+]] = !{ptr @kernel_unknown_and_aligned2, !"kernel", i32 1}
-; CGSCC: [[META19:![0-9]+]] = !{ptr @kernel_unknown_and_aligned3, !"kernel", i32 1}
-; CGSCC: [[META20:![0-9]+]] = !{ptr @kernel_unknown_and_not_aligned1, !"kernel", i32 1}
 ;.
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 42616155f0cc3..64fef02c8f3f8 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -7622,26 +7622,6 @@ TEST_F(OpenMPIRBuilderTest, createGPUOffloadEntry) {
                                /* Size = */ 0,
                                /* Flags = */ 0, GlobalValue::WeakAnyLinkage);
 
-  // Check nvvm.annotations only created for GPU kernels
-  NamedMDNode *MD = M->getNamedMetadata("nvvm.annotations");
-  EXPECT_NE(MD, nullptr);
-  EXPECT_EQ(MD->getNumOperands(), 1u);
-
-  MDNode *Annotations = MD->getOperand(0);
-  EXPECT_EQ(Annotations->getNumOperands(), 3u);
-
-  Constant *ConstVal =
-      dyn_cast<ConstantAsMetadata>(Annotations->getOperand(0))->getValue();
-  EXPECT_TRUE(isa<Function>(Fn));
-  EXPECT_EQ(ConstVal, cast<Function>(Fn));
-
-  EXPECT_TRUE(Annotations->getOperand(1).equalsStr("kernel"));
-
-  EXPECT_TRUE(mdconst::hasa<ConstantInt>(Annotations->getOperand(2)));
-  APInt IntVal =
-      mdconst::extract<ConstantInt>(Annotations->getOperand(2))->getValue();
-  EXPECT_EQ(IntVal, 1);
-
   // Check kernel attributes
   EXPECT_TRUE(Fn->hasFnAttribute("kernel"));
   EXPECT_TRUE(Fn->hasFnAttribute(Attribute::MustProgress));

From 4ea44eb1e292369b0b3f2f8ad4680081558f1e01 Mon Sep 17 00:00:00 2001
From: Heejin Ahn
Date: Fri, 24 Jan 2025 16:56:26 -0800
Subject: [PATCH 072/432] [WebAssembly] Fix EH feature flags when compiling
 multiple files (#124374)

#124042 caused a problem: when invoking `clang` with multiple files, the
static `HasRun` variables were set while processing the first file, so the
appropriate feature flags were not added for the second and subsequent
files. This fixes the problem by making those `HasRun` variables normal
variables within the enclosing function.
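For illustration, the failure mode reduces to this self-contained sketch
(illustrative names, not the actual driver code): a function-local `static`
inside a lambda belongs to the lambda's call operator, so it is initialized
once per process rather than once per call of the enclosing function.

#include <iostream>

// processFile() stands in for addClangTargetOptions(), which the driver
// calls once per input file within the same process.
void processFile(const char *Name) {
  auto addFeatureFlags = [&] {
    static bool HasRun = false; // one instance, shared across all calls
    if (HasRun)
      return;
    HasRun = true;
    std::cout << "adding feature flags for " << Name << "\n";
  };
  addFeatureFlags();
}

int main() {
  processFile("a.c"); // adds the flags
  processFile("b.c"); // silently skipped before this fix
}

Hoisting `HasRun` into the enclosing function as a plain local, as this
patch does, gives each driver invocation of the function a fresh guard.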
--- clang/lib/Driver/ToolChains/WebAssembly.cpp | 13 +++++++------ clang/test/Driver/wasm-toolchain.c | 8 ++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp index eebe3becada65..bd25fd1a8933a 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp @@ -344,12 +344,15 @@ void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, } } + bool HasBannedIncompatibleOptionsForWasmEHSjLj = false; + bool HasEnabledFeaturesForWasmEHSjLj = false; + // Bans incompatible options for Wasm EH / SjLj. We don't allow using // different modes for EH and SjLj. auto BanIncompatibleOptionsForWasmEHSjLj = [&](StringRef CurOption) { - static bool HasRun = false; - if (HasRun) + if (HasBannedIncompatibleOptionsForWasmEHSjLj) return; + HasBannedIncompatibleOptionsForWasmEHSjLj = true; if (DriverArgs.hasFlag(options::OPT_mno_exception_handing, options::OPT_mexception_handing, false)) getDriver().Diag(diag::err_drv_argument_not_allowed_with) @@ -373,14 +376,13 @@ void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, << CurOption << Option; } } - HasRun = true; }; // Enable necessary features for Wasm EH / SjLj in the backend. auto EnableFeaturesForWasmEHSjLj = [&]() { - static bool HasRun = false; - if (HasRun) + if (HasEnabledFeaturesForWasmEHSjLj) return; + HasEnabledFeaturesForWasmEHSjLj = true; CC1Args.push_back("-target-feature"); CC1Args.push_back("+exception-handling"); // The standardized Wasm EH spec requires multivalue and reference-types. @@ -390,7 +392,6 @@ void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, CC1Args.push_back("+reference-types"); // Backend needs '-exception-model=wasm' to use Wasm EH instructions CC1Args.push_back("-exception-model=wasm"); - HasRun = true; }; if (DriverArgs.getLastArg(options::OPT_fwasm_exceptions)) { diff --git a/clang/test/Driver/wasm-toolchain.c b/clang/test/Driver/wasm-toolchain.c index 2d14052082776..f516a4e457da7 100644 --- a/clang/test/Driver/wasm-toolchain.c +++ b/clang/test/Driver/wasm-toolchain.c @@ -224,6 +224,14 @@ // RUN: | FileCheck -check-prefix=WASM_LEGACY_EH_NO_EH %s // WASM_LEGACY_EH_NO_EH: invalid argument '-wasm-use-legacy-eh' not allowed with '-mno-exception-handling' +// When invoking clang with multiple files in a single command line, target +// feature flags should be equally added to the multiple clang-cc1 command lines +// RUN: %clang -### --target=wasm32-unknown-unknown \ +// RUN: --sysroot=/foo %s %s -mllvm -wasm-enable-sjlj 2>&1 \ +// RUN: | FileCheck -check-prefix=WASM_SJLJ_MULTI_FILES %s +// WASM_SJLJ_MULTI_FILES: "-cc1" {{.*}} "-target-feature" "+exception-handling" "-target-feature" "+multivalue" "-target-feature" "+reference-types" "-exception-model=wasm" +// WASM_SJLJ_MULTI_FILES: "-cc1" {{.*}} "-target-feature" "+exception-handling" "-target-feature" "+multivalue" "-target-feature" "+reference-types" "-exception-model=wasm" + // RUN: %clang -### %s -fsanitize=address --target=wasm32-unknown-emscripten 2>&1 | FileCheck -check-prefix=CHECK-ASAN-EMSCRIPTEN %s // CHECK-ASAN-EMSCRIPTEN: "-fsanitize=address" // CHECK-ASAN-EMSCRIPTEN: "-fsanitize-address-globals-dead-stripping" From 8e31050bc2e02d7a3c654def7d7af899ce1cdb1d Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 24 Jan 2025 17:57:04 -0800 Subject: [PATCH 073/432] [clang-format] Fix a bug in annotating overloaded co_await decl (#124240) Fixes #124223. 
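For reference, the declaration shape at issue reduces to this minimal
example (illustrative names; requires -std=c++20 for the co_await
operator-function-id). The out-of-line form is what the annotator
previously failed to recognize as a function declaration name:

struct Task {
  struct Awaitable {};
  Awaitable operator co_await(); // in-class declaration
};

// Out-of-line definition: a qualified return type followed by a qualified
// "operator co_await", mirroring the shape in the new regression test.
Task::Awaitable Task::operator co_await() { return {}; }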
---
 clang/lib/Format/TokenAnnotator.cpp           | 2 +-
 clang/unittests/Format/TokenAnnotatorTest.cpp | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index f36cf7b638e0d..bc41d43d1438c 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -3784,7 +3784,7 @@ static bool isFunctionDeclarationName(const LangOptions &LangOpts,
         return Next;
       if (Next->is(TT_OverloadedOperator))
         continue;
-      if (Next->isOneOf(tok::kw_new, tok::kw_delete)) {
+      if (Next->isOneOf(tok::kw_new, tok::kw_delete, tok::kw_co_await)) {
         // For 'new[]' and 'delete[]'.
         if (Next->Next &&
             Next->Next->startsSequence(tok::l_square, tok::r_square)) {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 9ac60ce73750b..10587449dcea9 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -1025,6 +1025,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsOverloadedOperators) {
   EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_OverloadedOperatorLParen);
   EXPECT_TOKEN(Tokens[8], tok::amp, TT_PointerOrReference);
   EXPECT_TOKEN(Tokens[12], tok::amp, TT_PointerOrReference);
+
+  Tokens = annotate("SomeLoooooooooooooooooType::Awaitable\n"
+                    "SomeLoooooooooooooooooType::operator co_await();");
+  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[7], tok::l_paren, TT_OverloadedOperatorLParen);
 }
 
 TEST_F(TokenAnnotatorTest, OverloadedOperatorInTemplate) {

From 3b35b4c7f9141c59fbac415e335489494b7d507e Mon Sep 17 00:00:00 2001
From: Jacques Pienaar
Date: Fri, 24 Jan 2025 18:08:44 -0800
Subject: [PATCH 074/432] [mlir] Allow fallback from file line col range to loc (#124321)

This was discussed during the original review, but I made it stricter than
discussed. This change makes FileLineColLoc a pure view and adds a helper
for bytecode serialization (I could have avoided the helper, but that ends
up with more logic and stronger coupling).
---
 mlir/include/mlir/IR/BuiltinDialectBytecode.td | 4 ++--
 mlir/include/mlir/IR/Location.h                | 9 +++++----
 mlir/lib/IR/Location.cpp                       | 6 ++----
 mlir/test/Target/LLVMIR/llvmir-debug.mlir      | 3 +++
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/mlir/include/mlir/IR/BuiltinDialectBytecode.td b/mlir/include/mlir/IR/BuiltinDialectBytecode.td
index 87da8fd3568fa..0208e8cdbf293 100644
--- a/mlir/include/mlir/IR/BuiltinDialectBytecode.td
+++ b/mlir/include/mlir/IR/BuiltinDialectBytecode.td
@@ -104,7 +104,7 @@ def FileLineColRange : DialectAttribute<(attr
     WithPrinter<"writeFileLineColRangeLocs($_writer, $_name)">>>>:$rawLocData
 )> {
   let cBuilder = "getFileLineColRange(context, filename, rawLocData)";
-  let printerPredicate = "!::llvm::isa<FileLineColLoc>($_val)";
+  let printerPredicate = "!isStrictFileLineColLoc($_val)";
 }
 
 def FileLineColLoc : DialectAttribute<(attr
   VarInt:$start_line,
   VarInt:$start_column
 )> {
-  let printerPredicate = "::llvm::isa<FileLineColLoc>($_val)";
+  let printerPredicate = "isStrictFileLineColLoc($_val)";
 }
 }
 
diff --git a/mlir/include/mlir/IR/Location.h b/mlir/include/mlir/IR/Location.h
index e206501f5ee6a..8ce36ed415ac1 100644
--- a/mlir/include/mlir/IR/Location.h
+++ b/mlir/include/mlir/IR/Location.h
@@ -177,7 +177,7 @@ class FusedLocWith : public FusedLoc {
 /// column number. This is similar to the type of location that you get from
 /// most source languages.
 ///
-/// FileLineColLoc is a FileLineColRange with exactly one line and column.
+/// FileLineColLoc is a view to FileLineColRange with one line and column.
 class FileLineColLoc : public FileLineColRange {
 public:
   using FileLineColRange::FileLineColRange;
@@ -190,11 +190,12 @@ class FileLineColLoc : public FileLineColRange {
   StringAttr getFilename() const;
   unsigned getLine() const;
   unsigned getColumn() const;
-
-  /// Methods for support type inquiry through isa, cast, and dyn_cast.
-  static bool classof(Attribute attr);
 };
 
+/// Returns true iff the given location is a FileLineColRange with exactly one
+/// line and column.
+bool isStrictFileLineColLoc(Location loc);
+
 //===----------------------------------------------------------------------===//
 // OpaqueLoc
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/IR/Location.cpp b/mlir/lib/IR/Location.cpp
index ce78d30ee0a52..7a4df4fbd46d9 100644
--- a/mlir/lib/IR/Location.cpp
+++ b/mlir/lib/IR/Location.cpp
@@ -177,10 +177,8 @@ unsigned FileLineColLoc::getLine() const { return getStartLine(); }
 
 unsigned FileLineColLoc::getColumn() const { return getStartColumn(); }
 
-bool FileLineColLoc::classof(Attribute attr) {
-  // This could also have been for <= 2. But given this is matching previous
-  // behavior, it is left as is.
-  if (auto range = mlir::dyn_cast<FileLineColRange>(attr))
+bool mlir::isStrictFileLineColLoc(Location loc) {
+  if (auto range = mlir::dyn_cast<FileLineColRange>(loc))
     return range.getImpl()->size() == 2;
   return false;
 }
diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
index eac2c5090a5b5..d15274311d745 100644
--- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
@@ -115,6 +115,9 @@ llvm.func @func_with_debug(%arg: i64) {
   // CHECK: call void @func_no_debug(), !dbg ![[FILE_LOC:[0-9]+]]
   llvm.call @func_no_debug() : () -> () loc("foo.mlir":1:2)
 
+  // CHECK: call void @func_no_debug(), !dbg ![[FILE_LOC:[0-9]+]]
+  llvm.call @func_no_debug() : () -> () loc("foo.mlir":1:2 to 5:6)
+
   // CHECK: call void @func_no_debug(), !dbg ![[NAMED_LOC:[0-9]+]]
   llvm.call @func_no_debug() : () -> () loc("named"("foo.mlir":10:10))

From ac1ba1f9dd7013852cd27f514467f57ee0e6ed16 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 24 Jan 2025 18:30:28 -0800
Subject: [PATCH 075/432] [CodeGen] Introduce a VirtRegOrUnit class to hold
 virtual reg or physical reg unit. NFC (#123768)

LiveIntervals and MachineVerifier were previously using Register to store
this, but reg units are different from physical registers. One important
difference is that 0 is a valid reg unit number, but it is not a valid
physical register. This patch introduces a new VirtRegOrUnit class that is
distinct from Register. It can be converted to/from a virtual Register or
an MCRegUnit. I've made all conversions explicit and used assertions to
check the validity. I also fixed a place in MachineVerifier that was
ignoring reg unit 0.
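The ambiguity reduces to this sketch (simplified stand-in types for
illustration, not the actual LLVM headers): unit 0 is a meaningful register
unit, but a Register-style wrapper treats 0 as "no register", so storing
units in it conflates the two.

#include <cassert>

using MCRegUnit = unsigned; // register unit numbers start at 0

// Simplified stand-in for llvm::Register, where the value 0 means
// "no register".
struct Reg {
  unsigned Id = 0;
  explicit operator bool() const { return Id != 0; }
};

int main() {
  MCRegUnit Unit = 0; // a perfectly valid register unit
  Reg R{Unit};        // the old representation...
  assert(!R);         // ...is indistinguishable from "no register"; this is
                      // how reg unit 0 came to be ignored. VirtRegOrUnit
                      // keeps virtual registers and units distinct instead.
}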
--- llvm/include/llvm/CodeGen/Register.h | 31 +++ .../include/llvm/CodeGen/TargetRegisterInfo.h | 4 +- llvm/lib/CodeGen/LiveIntervals.cpp | 31 +-- llvm/lib/CodeGen/MachineVerifier.cpp | 182 +++++++++--------- 4 files changed, 144 insertions(+), 104 deletions(-) diff --git a/llvm/include/llvm/CodeGen/Register.h b/llvm/include/llvm/CodeGen/Register.h index fac5f00110ef7..f8c6159a3c2dc 100644 --- a/llvm/include/llvm/CodeGen/Register.h +++ b/llvm/include/llvm/CodeGen/Register.h @@ -160,6 +160,37 @@ template <> struct DenseMapInfo { } }; +/// Wrapper class representing a virtual register or register unit. +class VirtRegOrUnit { + unsigned VRegOrUnit; + +public: + constexpr explicit VirtRegOrUnit(MCRegUnit Unit) : VRegOrUnit(Unit) { + assert(!Register::isVirtualRegister(VRegOrUnit)); + } + constexpr explicit VirtRegOrUnit(Register Reg) : VRegOrUnit(Reg.id()) { + assert(Reg.isVirtual()); + } + + constexpr bool isVirtualReg() const { + return Register::isVirtualRegister(VRegOrUnit); + } + + constexpr MCRegUnit asMCRegUnit() const { + assert(!isVirtualReg() && "Not a register unit"); + return VRegOrUnit; + } + + constexpr Register asVirtualReg() const { + assert(isVirtualReg() && "Not a virtual register"); + return Register(VRegOrUnit); + } + + constexpr bool operator==(const VirtRegOrUnit &Other) const { + return VRegOrUnit == Other.VRegOrUnit; + } +}; + } // namespace llvm #endif // LLVM_CODEGEN_REGISTER_H diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 0bf72637de398..63460f5a0dae3 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -466,9 +466,9 @@ class TargetRegisterInfo : public MCRegisterInfo { } /// Returns true if Reg contains RegUnit. - bool hasRegUnit(MCRegister Reg, Register RegUnit) const { + bool hasRegUnit(MCRegister Reg, MCRegUnit RegUnit) const { for (MCRegUnit Unit : regunits(Reg)) - if (Register(Unit) == RegUnit) + if (Unit == RegUnit) return true; return false; } diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index 4fdfcf547542d..3485a27335f13 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -1080,10 +1080,10 @@ class LiveIntervals::HMEditor { for (LiveInterval::SubRange &S : LI.subranges()) { if ((S.LaneMask & LaneMask).none()) continue; - updateRange(S, Reg, S.LaneMask); + updateRange(S, VirtRegOrUnit(Reg), S.LaneMask); } } - updateRange(LI, Reg, LaneBitmask::getNone()); + updateRange(LI, VirtRegOrUnit(Reg), LaneBitmask::getNone()); // If main range has a hole and we are moving a subrange use across // the hole updateRange() cannot properly handle it since it only // gets the LiveRange and not the whole LiveInterval. As a result @@ -1110,7 +1110,7 @@ class LiveIntervals::HMEditor { // precomputed live range. for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg())) if (LiveRange *LR = getRegUnitLI(Unit)) - updateRange(*LR, Unit, LaneBitmask::getNone()); + updateRange(*LR, VirtRegOrUnit(Unit), LaneBitmask::getNone()); } if (hasRegMask) updateRegMaskSlots(); @@ -1119,24 +1119,25 @@ class LiveIntervals::HMEditor { private: /// Update a single live range, assuming an instruction has been moved from /// OldIdx to NewIdx. 
- void updateRange(LiveRange &LR, Register Reg, LaneBitmask LaneMask) { + void updateRange(LiveRange &LR, VirtRegOrUnit VRegOrUnit, + LaneBitmask LaneMask) { if (!Updated.insert(&LR).second) return; LLVM_DEBUG({ dbgs() << " "; - if (Reg.isVirtual()) { - dbgs() << printReg(Reg); + if (VRegOrUnit.isVirtualReg()) { + dbgs() << printReg(VRegOrUnit.asVirtualReg()); if (LaneMask.any()) dbgs() << " L" << PrintLaneMask(LaneMask); } else { - dbgs() << printRegUnit(Reg, &TRI); + dbgs() << printRegUnit(VRegOrUnit.asMCRegUnit(), &TRI); } dbgs() << ":\t" << LR << '\n'; }); if (SlotIndex::isEarlierInstr(OldIdx, NewIdx)) handleMoveDown(LR); else - handleMoveUp(LR, Reg, LaneMask); + handleMoveUp(LR, VRegOrUnit, LaneMask); LLVM_DEBUG(dbgs() << " -->\t" << LR << '\n'); assert(LR.verify()); } @@ -1316,7 +1317,8 @@ class LiveIntervals::HMEditor { /// Update LR to reflect an instruction has been moved upwards from OldIdx /// to NewIdx (NewIdx < OldIdx). - void handleMoveUp(LiveRange &LR, Register Reg, LaneBitmask LaneMask) { + void handleMoveUp(LiveRange &LR, VirtRegOrUnit VRegOrUnit, + LaneBitmask LaneMask) { LiveRange::iterator E = LR.end(); // Segment going into OldIdx. LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex()); @@ -1340,7 +1342,7 @@ class LiveIntervals::HMEditor { SlotIndex DefBeforeOldIdx = std::max(OldIdxIn->start.getDeadSlot(), NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber())); - OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, Reg, LaneMask); + OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, VRegOrUnit, LaneMask); // Did we have a Def at OldIdx? If not we are done now. OldIdxOut = std::next(OldIdxIn); @@ -1498,11 +1500,12 @@ class LiveIntervals::HMEditor { } // Return the last use of reg between NewIdx and OldIdx. - SlotIndex findLastUseBefore(SlotIndex Before, Register Reg, + SlotIndex findLastUseBefore(SlotIndex Before, VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) { - if (Reg.isVirtual()) { + if (VRegOrUnit.isVirtualReg()) { SlotIndex LastUse = Before; - for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { + for (MachineOperand &MO : + MRI.use_nodbg_operands(VRegOrUnit.asVirtualReg())) { if (MO.isUndef()) continue; unsigned SubReg = MO.getSubReg(); @@ -1545,7 +1548,7 @@ class LiveIntervals::HMEditor { // Check if MII uses Reg. for (MIBundleOperands MO(*MII); MO.isValid(); ++MO) if (MO->isReg() && !MO->isUndef() && MO->getReg().isPhysical() && - TRI.hasRegUnit(MO->getReg(), Reg)) + TRI.hasRegUnit(MO->getReg(), VRegOrUnit.asMCRegUnit())) return Idx.getRegSlot(); } // Didn't reach Before. It must be the first instruction in the block. 
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index d41b11307e7bc..46ae3c6b05540 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -313,7 +313,7 @@ struct MachineVerifier { void report(const Twine &Msg, const MachineInstr *MI); void report_context(const LiveInterval &LI) const; - void report_context(const LiveRange &LR, Register VRegUnit, + void report_context(const LiveRange &LR, VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) const; void report_context(const LiveRange::Segment &S) const; void report_context(const VNInfo &VNI) const; @@ -322,18 +322,18 @@ struct MachineVerifier { void report_context_liverange(const LiveRange &LR) const; void report_context_lanemask(LaneBitmask LaneMask) const; void report_context_vreg(Register VReg) const; - void report_context_vreg_regunit(Register VRegOrUnit) const; + void report_context_vreg_regunit(VirtRegOrUnit VRegOrUnit) const; void verifyInlineAsm(const MachineInstr *MI); void checkLiveness(const MachineOperand *MO, unsigned MONum); void checkLivenessAtUse(const MachineOperand *MO, unsigned MONum, SlotIndex UseIdx, const LiveRange &LR, - Register VRegOrUnit, + VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask = LaneBitmask::getNone()); void checkLivenessAtDef(const MachineOperand *MO, unsigned MONum, SlotIndex DefIdx, const LiveRange &LR, - Register VRegOrUnit, bool SubRangeCheck = false, + VirtRegOrUnit VRegOrUnit, bool SubRangeCheck = false, LaneBitmask LaneMask = LaneBitmask::getNone()); void markReachable(const MachineBasicBlock *MBB); @@ -344,12 +344,12 @@ struct MachineVerifier { void verifyLiveVariables(); void verifyLiveIntervals(); void verifyLiveInterval(const LiveInterval &); - void verifyLiveRangeValue(const LiveRange &, const VNInfo *, Register, + void verifyLiveRangeValue(const LiveRange &, const VNInfo *, VirtRegOrUnit, LaneBitmask); void verifyLiveRangeSegment(const LiveRange &, - const LiveRange::const_iterator I, Register, + const LiveRange::const_iterator I, VirtRegOrUnit, LaneBitmask); - void verifyLiveRange(const LiveRange &, Register, + void verifyLiveRange(const LiveRange &, VirtRegOrUnit, LaneBitmask LaneMask = LaneBitmask::getNone()); void verifyStackFrame(); @@ -636,10 +636,11 @@ void MachineVerifier::report_context(const LiveInterval &LI) const { OS << "- interval: " << LI << '\n'; } -void MachineVerifier::report_context(const LiveRange &LR, Register VRegUnit, +void MachineVerifier::report_context(const LiveRange &LR, + VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) const { report_context_liverange(LR); - report_context_vreg_regunit(VRegUnit); + report_context_vreg_regunit(VRegOrUnit); if (LaneMask.any()) report_context_lanemask(LaneMask); } @@ -664,11 +665,13 @@ void MachineVerifier::report_context_vreg(Register VReg) const { OS << "- v. 
register: " << printReg(VReg, TRI) << '\n'; } -void MachineVerifier::report_context_vreg_regunit(Register VRegOrUnit) const { - if (VRegOrUnit.isVirtual()) { - report_context_vreg(VRegOrUnit); +void MachineVerifier::report_context_vreg_regunit( + VirtRegOrUnit VRegOrUnit) const { + if (VRegOrUnit.isVirtualReg()) { + report_context_vreg(VRegOrUnit.asVirtualReg()); } else { - OS << "- regunit: " << printRegUnit(VRegOrUnit, TRI) << '\n'; + OS << "- regunit: " << printRegUnit(VRegOrUnit.asMCRegUnit(), TRI) + << '\n'; } } @@ -2828,7 +2831,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { void MachineVerifier::checkLivenessAtUse(const MachineOperand *MO, unsigned MONum, SlotIndex UseIdx, const LiveRange &LR, - Register VRegOrUnit, + VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) { const MachineInstr *MI = MO->getParent(); @@ -2863,7 +2866,7 @@ void MachineVerifier::checkLivenessAtUse(const MachineOperand *MO, void MachineVerifier::checkLivenessAtDef(const MachineOperand *MO, unsigned MONum, SlotIndex DefIdx, const LiveRange &LR, - Register VRegOrUnit, + VirtRegOrUnit VRegOrUnit, bool SubRangeCheck, LaneBitmask LaneMask) { if (!LR.verify()) { @@ -2908,7 +2911,7 @@ void MachineVerifier::checkLivenessAtDef(const MachineOperand *MO, if (MO->isDead()) { LiveQueryResult LRQ = LR.Query(DefIdx); if (!LRQ.isDeadDef()) { - assert(VRegOrUnit.isVirtual() && "Expecting a virtual register."); + assert(VRegOrUnit.isVirtualReg() && "Expecting a virtual register."); // A dead subreg def only tells us that the specific subreg is dead. There // could be other non-dead defs of other subregs, or we could have other // parts of the register being live through the instruction. So unless we @@ -2973,13 +2976,13 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { if (MRI->isReservedRegUnit(Unit)) continue; if (const LiveRange *LR = LiveInts->getCachedRegUnit(Unit)) - checkLivenessAtUse(MO, MONum, UseIdx, *LR, Unit); + checkLivenessAtUse(MO, MONum, UseIdx, *LR, VirtRegOrUnit(Unit)); } } if (Reg.isVirtual()) { // This is a virtual register interval. 
- checkLivenessAtUse(MO, MONum, UseIdx, *LI, Reg); + checkLivenessAtUse(MO, MONum, UseIdx, *LI, VirtRegOrUnit(Reg)); if (LI->hasSubRanges() && !MO->isDef()) { LaneBitmask MOMask = SubRegIdx != 0 @@ -2989,7 +2992,8 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { for (const LiveInterval::SubRange &SR : LI->subranges()) { if ((MOMask & SR.LaneMask).none()) continue; - checkLivenessAtUse(MO, MONum, UseIdx, SR, Reg, SR.LaneMask); + checkLivenessAtUse(MO, MONum, UseIdx, SR, VirtRegOrUnit(Reg), + SR.LaneMask); LiveQueryResult LRQ = SR.Query(UseIdx); if (LRQ.valueIn() || (MI->isPHI() && LRQ.valueOut())) LiveInMask |= SR.LaneMask; @@ -3081,7 +3085,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { DefIdx = DefIdx.getRegSlot(MO->isEarlyClobber()); if (Reg.isVirtual()) { - checkLivenessAtDef(MO, MONum, DefIdx, *LI, Reg); + checkLivenessAtDef(MO, MONum, DefIdx, *LI, VirtRegOrUnit(Reg)); if (LI->hasSubRanges()) { LaneBitmask MOMask = SubRegIdx != 0 @@ -3090,7 +3094,8 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { for (const LiveInterval::SubRange &SR : LI->subranges()) { if ((SR.LaneMask & MOMask).none()) continue; - checkLivenessAtDef(MO, MONum, DefIdx, SR, Reg, true, SR.LaneMask); + checkLivenessAtDef(MO, MONum, DefIdx, SR, VirtRegOrUnit(Reg), true, + SR.LaneMask); } } } @@ -3532,11 +3537,12 @@ void MachineVerifier::verifyLiveIntervals() { // Verify all the cached regunit intervals. for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i) if (const LiveRange *LR = LiveInts->getCachedRegUnit(i)) - verifyLiveRange(*LR, i); + verifyLiveRange(*LR, VirtRegOrUnit(i)); } void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, - const VNInfo *VNI, Register Reg, + const VNInfo *VNI, + VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) { if (VNI->isUnused()) return; @@ -3545,14 +3551,14 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, if (!DefVNI) { report("Value not live at VNInfo def and not marked unused", MF); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); return; } if (DefVNI != VNI) { report("Live segment at def has different VNInfo", MF); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); return; } @@ -3560,7 +3566,7 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def); if (!MBB) { report("Invalid VNInfo definition index", MF); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); return; } @@ -3568,7 +3574,7 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, if (VNI->isPHIDef()) { if (VNI->def != LiveInts->getMBBStartIdx(MBB)) { report("PHIDef VNInfo is not defined at MBB start", MBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); } return; @@ -3578,57 +3584,56 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def); if (!MI) { report("No instruction at VNInfo def index", MBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); return; } - if (Reg != 0) { - bool hasDef = false; - bool isEarlyClobber = false; - for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) { - if (!MOI->isReg() || !MOI->isDef()) + bool hasDef = false; + bool 
isEarlyClobber = false; + for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) { + if (!MOI->isReg() || !MOI->isDef()) + continue; + if (VRegOrUnit.isVirtualReg()) { + if (MOI->getReg() != VRegOrUnit.asVirtualReg()) continue; - if (Reg.isVirtual()) { - if (MOI->getReg() != Reg) - continue; - } else { - if (!MOI->getReg().isPhysical() || !TRI->hasRegUnit(MOI->getReg(), Reg)) - continue; - } - if (LaneMask.any() && - (TRI->getSubRegIndexLaneMask(MOI->getSubReg()) & LaneMask).none()) + } else { + if (!MOI->getReg().isPhysical() || + !TRI->hasRegUnit(MOI->getReg(), VRegOrUnit.asMCRegUnit())) continue; - hasDef = true; - if (MOI->isEarlyClobber()) - isEarlyClobber = true; } + if (LaneMask.any() && + (TRI->getSubRegIndexLaneMask(MOI->getSubReg()) & LaneMask).none()) + continue; + hasDef = true; + if (MOI->isEarlyClobber()) + isEarlyClobber = true; + } - if (!hasDef) { - report("Defining instruction does not modify register", MI); - report_context(LR, Reg, LaneMask); - report_context(*VNI); - } + if (!hasDef) { + report("Defining instruction does not modify register", MI); + report_context(LR, VRegOrUnit, LaneMask); + report_context(*VNI); + } - // Early clobber defs begin at USE slots, but other defs must begin at - // DEF slots. - if (isEarlyClobber) { - if (!VNI->def.isEarlyClobber()) { - report("Early clobber def must be at an early-clobber slot", MBB); - report_context(LR, Reg, LaneMask); - report_context(*VNI); - } - } else if (!VNI->def.isRegister()) { - report("Non-PHI, non-early clobber def must be at a register slot", MBB); - report_context(LR, Reg, LaneMask); + // Early clobber defs begin at USE slots, but other defs must begin at + // DEF slots. + if (isEarlyClobber) { + if (!VNI->def.isEarlyClobber()) { + report("Early clobber def must be at an early-clobber slot", MBB); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); } + } else if (!VNI->def.isRegister()) { + report("Non-PHI, non-early clobber def must be at a register slot", MBB); + report_context(LR, VRegOrUnit, LaneMask); + report_context(*VNI); } } void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, const LiveRange::const_iterator I, - Register Reg, + VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) { const LiveRange::Segment &S = *I; const VNInfo *VNI = S.valno; @@ -3636,28 +3641,28 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) { report("Foreign valno in live segment", MF); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); report_context(*VNI); } if (VNI->isUnused()) { report("Live segment valno is marked unused", MF); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start); if (!MBB) { report("Bad start of live segment, no basic block", MF); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); return; } SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB); if (S.start != MBBStartIdx && S.start != VNI->def) { report("Live segment must begin at MBB entry or valno def", MBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } @@ -3665,7 +3670,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, LiveInts->getMBBFromIndex(S.end.getPrevSlot()); if (!EndMBB) { report("Bad end of live segment, no basic block", MF); - 
report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); return; } @@ -3673,7 +3678,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // Checks for non-live-out segments. if (S.end != LiveInts->getMBBEndIdx(EndMBB)) { // RegUnit intervals are allowed dead phis. - if (!Reg.isVirtual() && VNI->isPHIDef() && S.start == VNI->def && + if (!VRegOrUnit.isVirtualReg() && VNI->isPHIDef() && S.start == VNI->def && S.end == VNI->def.getDeadSlot()) return; @@ -3682,7 +3687,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, LiveInts->getInstructionFromIndex(S.end.getPrevSlot()); if (!MI) { report("Live segment doesn't end at a valid instruction", EndMBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); return; } @@ -3690,7 +3695,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // The block slot must refer to a basic block boundary. if (S.end.isBlock()) { report("Live segment ends at B slot of an instruction", EndMBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } @@ -3699,7 +3704,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // That means there must be a dead def. if (!SlotIndex::isSameInstr(S.start, S.end)) { report("Live segment ending at dead slot spans instructions", EndMBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } } @@ -3715,21 +3720,21 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, report("Live segment ending at early clobber slot must be " "redefined by an EC def in the same instruction", EndMBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } } // The following checks only apply to virtual registers. Physreg liveness // is too weird to check. - if (Reg.isVirtual()) { + if (VRegOrUnit.isVirtualReg()) { // A live segment can end with either a redefinition, a kill flag on a // use, or a dead flag on a def. bool hasRead = false; bool hasSubRegDef = false; bool hasDeadDef = false; for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) { - if (!MOI->isReg() || MOI->getReg() != Reg) + if (!MOI->isReg() || MOI->getReg() != VRegOrUnit.asVirtualReg()) continue; unsigned Sub = MOI->getSubReg(); LaneBitmask SLM = @@ -3758,18 +3763,18 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, report( "Instruction ending live segment on dead slot has no dead flag", MI); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } } else { if (!hasRead) { // When tracking subregister liveness, the main range must start new // values on partial register writes, even if there is no read. 
- if (!MRI->shouldTrackSubRegLiveness(Reg) || LaneMask.any() || - !hasSubRegDef) { + if (!MRI->shouldTrackSubRegLiveness(VRegOrUnit.asVirtualReg()) || + LaneMask.any() || !hasSubRegDef) { report("Instruction ending live segment doesn't read the register", MI); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } } @@ -3790,14 +3795,14 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, SmallVector Undefs; if (LaneMask.any()) { - LiveInterval &OwnerLI = LiveInts->getInterval(Reg); + LiveInterval &OwnerLI = LiveInts->getInterval(VRegOrUnit.asVirtualReg()); OwnerLI.computeSubRangeUndefs(Undefs, LaneMask, *MRI, *Indexes); } while (true) { assert(LiveInts->isLiveInToMBB(LR, &*MFI)); // We don't know how to track physregs into a landing pad. - if (!Reg.isVirtual() && MFI->isEHPad()) { + if (!VRegOrUnit.isVirtualReg() && MFI->isEHPad()) { if (&*MFI == EndMBB) break; ++MFI; @@ -3830,7 +3835,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, if (LiveRangeCalc::isJointlyDominated(Pred, Undefs, *Indexes)) continue; report("Register not marked live out of predecessor", Pred); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); OS << " live into " << printMBBReference(*MFI) << '@' << LiveInts->getMBBStartIdx(&*MFI) << ", not live before " << PEnd @@ -3841,7 +3846,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // Only PHI-defs can take different predecessor values. if (!IsPHI && PVNI != VNI) { report("Different value live out of predecessor", Pred); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); OS << "Valno #" << PVNI->id << " live out of " << printMBBReference(*Pred) << '@' << PEnd << "\nValno #" << VNI->id << " live into " << printMBBReference(*MFI) << '@' @@ -3854,19 +3859,20 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, } } -void MachineVerifier::verifyLiveRange(const LiveRange &LR, Register Reg, +void MachineVerifier::verifyLiveRange(const LiveRange &LR, + VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) { for (const VNInfo *VNI : LR.valnos) - verifyLiveRangeValue(LR, VNI, Reg, LaneMask); + verifyLiveRangeValue(LR, VNI, VRegOrUnit, LaneMask); for (LiveRange::const_iterator I = LR.begin(), E = LR.end(); I != E; ++I) - verifyLiveRangeSegment(LR, I, Reg, LaneMask); + verifyLiveRangeSegment(LR, I, VRegOrUnit, LaneMask); } void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) { Register Reg = LI.reg(); assert(Reg.isVirtual()); - verifyLiveRange(LI, Reg); + verifyLiveRange(LI, VirtRegOrUnit(Reg)); if (LI.hasSubRanges()) { LaneBitmask Mask; @@ -3882,10 +3888,10 @@ void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) { } if (SR.empty()) { report("Subrange must not be empty", MF); - report_context(SR, LI.reg(), SR.LaneMask); + report_context(SR, VirtRegOrUnit(LI.reg()), SR.LaneMask); } Mask |= SR.LaneMask; - verifyLiveRange(SR, LI.reg(), SR.LaneMask); + verifyLiveRange(SR, VirtRegOrUnit(LI.reg()), SR.LaneMask); if (!LI.covers(SR)) { report("A Subrange is not covered by the main range", MF); report_context(LI); From 8a6b44bf4cfe5df3db687a6b9519e99dbce8cf54 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Sat, 25 Jan 2025 03:55:09 +0000 Subject: [PATCH 076/432] Revert "[libc++] Fix tests for clang::no_specializations for C++17 and C++20" This reverts commit 4df9c17e5f436702ca4f5439322972b0385d629a. 
Reason: buildbot breakage (https://lab.llvm.org/buildbot/#/builders/24/builds/4598/steps/10/logs/stdio) --- libcxx/include/__type_traits/result_of.h | 2 +- .../ranges/no_specializations.verify.cpp | 4 +--- .../type_traits/no_specializations.verify.cpp | 24 +++++++------------ 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/libcxx/include/__type_traits/result_of.h b/libcxx/include/__type_traits/result_of.h index 8cc009dbe8baa..217ca70b4cd20 100644 --- a/libcxx/include/__type_traits/result_of.h +++ b/libcxx/include/__type_traits/result_of.h @@ -22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS) template -struct _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_NO_SPECIALIZATIONS result_of; +struct _LIBCPP_DEPRECATED_IN_CXX17 result_of; template struct _LIBCPP_TEMPLATE_VIS result_of<_Fp(_Args...)> : __invoke_result<_Fp, _Args...> {}; diff --git a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp index 489e3a6a73744..69d458a920558 100644 --- a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp @@ -13,9 +13,7 @@ #include -#include "test_macros.h" - -#if !__has_warning("-Winvalid-specialization") || TEST_STD_VER <= 20 +#if !__has_warning("-Winvalid-specialization") // expected-no-diagnostics #else struct S {}; diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp index 807d01e381b49..e6d960667e8c0 100644 --- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -36,22 +36,15 @@ SPECIALIZE_TRAIT(make_unsigned); // expected-error {{cannot be specialize SPECIALIZE_TRAIT(remove_all_extents); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_const); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_cv); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_extent); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_pointer); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_reference); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_volatile); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(underlying_type); // expected-error {{cannot be specialized}} - -# if TEST_STD_VER <= 17 -SPECIALIZE_TRAIT(result_of); // expected-error {{cannot be specialized}} -# endif - -# if TEST_STD_VER >= 20 -SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}} -# endif +SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}} # undef SPECIALIZE_TRAIT # define SPECIALIZE_UTT(Trait) \ @@ -103,6 +96,7 @@ SPECIALIZE_UTT(is_move_assignable); // expected-error 2 {{cannot SPECIALIZE_UTT(is_move_constructible); // expected-error 2 {{cannot be specialized}} SPECIALIZE_BTT(is_nothrow_assignable); // expected-error 2 {{cannot be specialized}} 
SPECIALIZE_UTT(is_nothrow_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_nothrow_copy_assignable); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_nothrow_copy_constructible); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_nothrow_default_constructible); // expected-error 2 {{cannot be specialized}} @@ -136,6 +130,7 @@ SPECIALIZE_UTT(is_trivially_default_constructible); // expected-error 2 {{cannot SPECIALIZE_UTT(is_trivially_destructible); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_trivially_move_assignable); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_trivially_move_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_union); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_unsigned); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_void); // expected-error 2 {{cannot be specialized}} @@ -145,12 +140,11 @@ SPECIALIZE_UTT(rank); // expected-error 2 {{cannot # if TEST_STD_VER <= 17 SPECIALIZE_UTT(is_literal_type); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(result_of); // expected-error 2 {{cannot be specialized}} # endif # if TEST_STD_VER >= 20 -SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}} -SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}} -SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}} # endif # if TEST_STD_VER >= 23 @@ -177,8 +171,6 @@ struct std::conditional; // expected-error {{cannot be specialized}} template <> struct std::enable_if; // expected-error {{cannot be specialized}} -#if TEST_STD_VER >= 20 template <> struct std::integral_constant; // expected-error {{cannot be specialized}} #endif -#endif From 1f26ac10ca1bef40a80be8f81a6f109713bc586f Mon Sep 17 00:00:00 2001 From: mconst Date: Fri, 24 Jan 2025 20:03:57 -0800 Subject: [PATCH 077/432] [X86] Better handling of impossibly large stack frames (#124217) If you try to create a stack frame of 4 GiB or larger with a 32-bit stack pointer, we currently emit invalid instructions like `mov eax, 5000000000` (unless you specify `-fstack-clash-protection`, in which case we emit a trap instead). The trap seems nicer, so let's do that in all cases. This avoids emitting invalid instructions, and also fixes the "can't have 32-bit 16GB stack frame" assertion in `X86FrameLowering::emitSPUpdate()` (which used to be triggerable by user code, but is now correct). This was originally part of #124041. @phoebewang --- llvm/lib/Target/X86/X86FrameLowering.cpp | 13 +++++++--- llvm/test/CodeGen/X86/huge-stack-offset.ll | 8 +++--- .../CodeGen/X86/stack-clash-extra-huge.ll | 26 ++----------------- 3 files changed, 14 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 47cc6a18ef843..a7b60afb7f547 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -234,6 +234,14 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, MachineInstr::MIFlag Flag = isSub ? 
MachineInstr::FrameSetup : MachineInstr::FrameDestroy; + if (!Uses64BitFramePtr && !isUInt<32>(Offset)) { + // We're being asked to adjust a 32-bit stack pointer by 4 GiB or more. + // This might be unreachable code, so don't complain now; just trap if + // it's reached at runtime. + BuildMI(MBB, MBBI, DL, TII.get(X86::TRAP)); + return; + } + uint64_t Chunk = (1LL << 31) - 1; MachineFunction &MF = *MBB.getParent(); @@ -829,10 +837,7 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); } else { - // We're being asked to probe a stack frame that's 4 GiB or larger, - // but our stack pointer is only 32 bits. This might be unreachable - // code, so don't complain now; just trap if it's reached at runtime. - BuildMI(MBB, MBBI, DL, TII.get(X86::TRAP)); + llvm_unreachable("Offset too large for 32-bit stack pointer"); } // while in the loop, use loop-invariant reg for CFI, diff --git a/llvm/test/CodeGen/X86/huge-stack-offset.ll b/llvm/test/CodeGen/X86/huge-stack-offset.ll index e825328ccd89a..6629811a59b23 100644 --- a/llvm/test/CodeGen/X86/huge-stack-offset.ll +++ b/llvm/test/CodeGen/X86/huge-stack-offset.ll @@ -13,11 +13,9 @@ define void @foo() nounwind { ; CHECK-64-NEXT: addq [[RAX]], %rsp ; CHECK-32-LABEL: foo: -; CHECK-32: movl $50000000{{..}}, %eax -; CHECK-32-NEXT: subl %eax, %esp +; CHECK-32: ud2 ; CHECK-32-NOT: subl $2147483647, %esp -; CHECK-32: movl $50000000{{..}}, [[EAX:%e..]] -; CHECK-32-NEXT: addl [[EAX]], %esp +; CHECK-32: ud2 %1 = alloca [5000000000 x i8], align 16 call void @bar(ptr %1) ret void @@ -46,7 +44,7 @@ define i32 @foo3(i32 inreg %x) nounwind { ; CHECK-64-NEXT: subq %rax, %rsp ; CHECK-32-LABEL: foo3: -; CHECK-32: subl $2147483647, %esp +; CHECK-32: ud2 ; CHECK-32-NOT: movl ${{.*}}, %eax %1 = alloca [5000000000 x i8], align 16 call void @bar(ptr %1) diff --git a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll index b8031056fd6b0..d9b20f50e9a88 100644 --- a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll +++ b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll @@ -30,44 +30,22 @@ define i32 @foo() local_unnamed_addr #0 { ; CHECK-X86-LABEL: foo: ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: ud2 -; CHECK-X86-NEXT: .cfi_def_cfa_register %eax -; CHECK-X86-NEXT: .cfi_adjust_cfa_offset 4800000000 -; CHECK-X86-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 -; CHECK-X86-NEXT: subl $4096, %esp # imm = 0x1000 -; CHECK-X86-NEXT: movl $0, (%esp) -; CHECK-X86-NEXT: cmpl %eax, %esp -; CHECK-X86-NEXT: jne .LBB0_1 -; CHECK-X86-NEXT: # %bb.2: -; CHECK-X86-NEXT: subl $12, %esp -; CHECK-X86-NEXT: .cfi_def_cfa_register %esp ; CHECK-X86-NEXT: .cfi_def_cfa_offset 4800000016 ; CHECK-X86-NEXT: movl $1, 392(%esp) ; CHECK-X86-NEXT: movl $1, 28792(%esp) ; CHECK-X86-NEXT: movl (%esp), %eax -; CHECK-X86-NEXT: movl $4800000012, %ecx # imm = 0x11E1A300C -; CHECK-X86-NEXT: addl %ecx, %esp +; CHECK-X86-NEXT: ud2 ; CHECK-X86-NEXT: .cfi_def_cfa_offset 4 ; CHECK-X86-NEXT: retl ; ; CHECK-X32-LABEL: foo: ; CHECK-X32: # %bb.0: ; CHECK-X32-NEXT: ud2 -; CHECK-X32-NEXT: .cfi_def_cfa_register %r11 -; CHECK-X32-NEXT: .cfi_adjust_cfa_offset 4799995904 -; CHECK-X32-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 -; CHECK-X32-NEXT: subl $4096, %esp # imm = 0x1000 -; CHECK-X32-NEXT: movq $0, (%esp) -; CHECK-X32-NEXT: cmpl %r11d, %esp -; CHECK-X32-NEXT: jne .LBB0_1 -; CHECK-X32-NEXT: # %bb.2: -; CHECK-X32-NEXT: subl $3976, %esp # imm = 0xF88 -; CHECK-X32-NEXT: .cfi_def_cfa_register %rsp ; CHECK-X32-NEXT: 
.cfi_def_cfa_offset 4799999888 ; CHECK-X32-NEXT: movl $1, 264(%esp) ; CHECK-X32-NEXT: movl $1, 28664(%esp) ; CHECK-X32-NEXT: movl -128(%esp), %eax -; CHECK-X32-NEXT: movl $4799999880, %ecx # imm = 0x11E1A2F88 -; CHECK-X32-NEXT: addl %ecx, %esp +; CHECK-X32-NEXT: ud2 ; CHECK-X32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-X32-NEXT: retq %a = alloca i32, i64 1200000000, align 16 From f607e3fd23ef0019b2f3b289b4d46012400b8db5 Mon Sep 17 00:00:00 2001 From: Valentyn Yukhymenko Date: Sat, 25 Jan 2025 07:01:40 +0000 Subject: [PATCH 078/432] [Clang][Sema] Reject declaring an alias template with the same name as its template parameter. (#123533) The issue occurred because the template parameter scope was skipped too early, before diagnosing the alias name shadowing. To fix this, the patch moves it to after LookupName, such that the behavior remains consistent with the typedef implementation. Fixes llvm#123423 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaDeclCXX.cpp | 5 +++-- .../temp/temp.decls/temp.variadic/fixed-expansion.cpp | 4 ++-- clang/test/SemaCXX/alias-template.cpp | 10 ++++++++-- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index f110b8cf76507..e9fffddd507c6 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -990,6 +990,7 @@ Bug Fixes to C++ Support - Fix immediate escalation not propagating through inherited constructors. (#GH112677) - Fixed assertions or false compiler diagnostics in the case of C++ modules for lambda functions or inline friend functions defined inside templates (#GH122493). +- Clang now rejects declaring an alias template with the same name as its template parameter. (#GH123423) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 839b3a1cccdcc..08065e3cad2bb 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -13406,8 +13406,6 @@ Decl *Sema::ActOnAliasDeclaration(Scope *S, AccessSpecifier AS, SourceLocation UsingLoc, UnqualifiedId &Name, const ParsedAttributesView &AttrList, TypeResult Type, Decl *DeclFromDeclSpec) { - // Get the innermost enclosing declaration scope. - S = S->getDeclParent(); if (Type.isInvalid()) return nullptr; @@ -13458,6 +13456,9 @@ Decl *Sema::ActOnAliasDeclaration(Scope *S, AccessSpecifier AS, CheckTypedefForVariablyModifiedType(S, NewTD); Invalid |= NewTD->isInvalidDecl(); + // Get the innermost enclosing declaration scope. 
+ S = S->getDeclParent(); + bool Redeclaration = false; NamedDecl *NewND; diff --git a/clang/test/CXX/temp/temp.decls/temp.variadic/fixed-expansion.cpp b/clang/test/CXX/temp/temp.decls/temp.variadic/fixed-expansion.cpp index a990c82564aa4..ab4c663d24c7d 100644 --- a/clang/test/CXX/temp/temp.decls/temp.variadic/fixed-expansion.cpp +++ b/clang/test/CXX/temp/temp.decls/temp.variadic/fixed-expansion.cpp @@ -121,8 +121,8 @@ namespace PartialSpecialization { namespace FixedAliasTemplate { template struct S {}; - template using U = S; // expected-note 2{{template parameter is declared here}} - template U &f(U, Ts...); // expected-error 2{{pack expansion used as argument for non-pack parameter of alias template}} + template using Z = S; // expected-note 2{{template parameter is declared here}} + template Z &f(Z, Ts...); // expected-error 2{{pack expansion used as argument for non-pack parameter of alias template}} S &s1 = f({}, 0, 0.0); // expected-error {{no matching function}} } diff --git a/clang/test/SemaCXX/alias-template.cpp b/clang/test/SemaCXX/alias-template.cpp index 5189405e23db5..b49d36a6267e6 100644 --- a/clang/test/SemaCXX/alias-template.cpp +++ b/clang/test/SemaCXX/alias-template.cpp @@ -54,18 +54,24 @@ namespace LookupFilter { template using S = S*; // ok } -namespace InFunctions { +namespace UnexpandedPack { template struct S0 { template using U = T*; // expected-error {{declaration type contains unexpanded parameter pack 'T'}} U u; }; +} +namespace InvalidType { template using T1 = int; template using T2 = int[-1]; // expected-error {{array size is negative}} +} + +namespace ShadowTemplateParam { template struct S3 { // expected-note {{template parameter is declared here}} template using T = int; // expected-error {{declaration of 'T' shadows template parameter}} }; - template using Z = Z; + template // expected-note {{template parameter is declared here}} + using Z = Z; // expected-error {{declaration of 'Z' shadows template parameter}} } namespace ClassNameRedecl { From c216081e981ea14536024b86df79ddee9fe517e0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 01:15:38 -0800 Subject: [PATCH 079/432] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#124388) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect P to be nonnull. 
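
To make the mechanical change concrete, here is a minimal sketch of the
before/after pattern. This is illustration only: Foo, Bar, and the variable
names are placeholders, not code from this patch.

    #include "llvm/ADT/PointerUnion.h"

    struct Foo { int X = 0; };
    struct Bar { int Y = 0; };

    void example(Foo &F) {
      llvm::PointerUnion<Foo *, Bar *> P = &F; // nonnull, holds a Foo *

      // Old, soft-deprecated member form:
      if (Foo *FP = P.dyn_cast<Foo *>())
        (void)FP;

      // New free-function form used by this migration. llvm::dyn_cast
      // asserts that P is nonnull; llvm::dyn_cast_if_present is the
      // variant that also tolerates a null union.
      if (Foo *FP = llvm::dyn_cast<Foo *>(P))
        (void)FP;
    }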
--- clang/lib/AST/DeclTemplate.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 40ee3753c2422..2933ba7fb8a29 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -1049,7 +1049,7 @@ ClassTemplateSpecializationDecl::getSourceRange() const { assert(!Pattern.isNull() && "Class template specialization without pattern?"); if (const auto *CTPSD = - Pattern.dyn_cast()) + dyn_cast(Pattern)) return CTPSD->getSourceRange(); return cast(Pattern)->getSourceRange(); } @@ -1773,7 +1773,7 @@ TemplateParameterList *clang::getReplacedTemplateParameterList(Decl *D) { const auto *CTSD = cast(D); auto P = CTSD->getSpecializedTemplateOrPartial(); if (const auto *CTPSD = - P.dyn_cast()) + dyn_cast(P)) return CTPSD->getTemplateParameters(); return cast(P)->getTemplateParameters(); } From 186d6546d9c5898a0a32f4616558021d9a908786 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 01:16:00 -0800 Subject: [PATCH 080/432] [Index] Migrate away from PointerUnion::dyn_cast (NFC) (#124389) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect DclInfo.DeclOrMacro to be nonnull. --- clang/lib/Index/FileIndexRecord.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Index/FileIndexRecord.cpp b/clang/lib/Index/FileIndexRecord.cpp index 449c33637eb7e..cf40a596f5094 100644 --- a/clang/lib/Index/FileIndexRecord.cpp +++ b/clang/lib/Index/FileIndexRecord.cpp @@ -55,7 +55,7 @@ void FileIndexRecord::removeHeaderGuardMacros() { void FileIndexRecord::print(llvm::raw_ostream &OS, SourceManager &SM) const { OS << "DECLS BEGIN ---\n"; for (auto &DclInfo : Decls) { - if (const auto *D = DclInfo.DeclOrMacro.dyn_cast()) { + if (const auto *D = dyn_cast(DclInfo.DeclOrMacro)) { SourceLocation Loc = SM.getFileLoc(D->getLocation()); PresumedLoc PLoc = SM.getPresumedLoc(Loc); OS << llvm::sys::path::filename(PLoc.getFilename()) << ':' From 62bd217b5a1cf6b231b2413b5522533986d4e5df Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 01:16:17 -0800 Subject: [PATCH 081/432] [Sema] Migrate away from PointerUnion::dyn_cast (NFC) (#124391) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect DeclOrIterator to be nonnull. 
--- clang/lib/Sema/SemaCodeComplete.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 58f3efbe0daf8..bc0f6a9435f95 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -678,7 +678,7 @@ class ResultBuilder::ShadowMapEntry::iterator { }*/ reference operator*() const { - if (const NamedDecl *ND = DeclOrIterator.dyn_cast()) + if (const NamedDecl *ND = dyn_cast(DeclOrIterator)) return reference(ND, SingleDeclIndex); return *cast(DeclOrIterator); From 0cc74a8941884d56a4718c28cc5b8ef8dbe17047 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 01:17:22 -0800 Subject: [PATCH 082/432] [CodeGen] Avoid repeated hash lookups (NFC) (#124392) --- llvm/lib/CodeGen/ModuloSchedule.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index d99b6ace01000..f9fe812f7e65c 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -397,8 +397,9 @@ void ModuloScheduleExpander::generateExistingPhis( // The Phi value from the loop body typically is defined in the loop, but // not always. So, we need to check if the value is defined in the loop. unsigned PhiOp2 = LoopVal; - if (VRMap[LastStageNum].count(LoopVal)) - PhiOp2 = VRMap[LastStageNum][LoopVal]; + if (auto It = VRMap[LastStageNum].find(LoopVal); + It != VRMap[LastStageNum].end()) + PhiOp2 = It->second; int StageScheduled = Schedule.getStage(&*BBI); int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal)); @@ -1055,8 +1056,8 @@ void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI, // Make an adjustment to get the last definition. StageNum -= StageDiff; } - if (VRMap[StageNum].count(reg)) - MO.setReg(VRMap[StageNum][reg]); + if (auto It = VRMap[StageNum].find(reg); It != VRMap[StageNum].end()) + MO.setReg(It->second); } } } @@ -1710,8 +1711,8 @@ void PeelingModuloScheduleExpander::moveStageBetweenBlocks( for (MachineOperand &MO : I->uses()) { if (!MO.isReg()) continue; - if (Remaps.count(MO.getReg())) - MO.setReg(Remaps[MO.getReg()]); + if (auto It = Remaps.find(MO.getReg()); It != Remaps.end()) + MO.setReg(It->second); else { // If we are using a phi from the source block we need to add a new phi // pointing to the old one. From 72918fd11dd805b578bbc9c4f36bea3bc96f37b5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 01:17:38 -0800 Subject: [PATCH 083/432] [GlobalISel] Avoid repeated hash lookups (NFC) (#124393) --- llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp index 3a9069848ca1d..0222069cfc576 100644 --- a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp @@ -181,10 +181,7 @@ MachineInstr *GISelCSEInfo::getMachineInstrIfExists(FoldingSetNodeID &ID, void GISelCSEInfo::countOpcodeHit(unsigned Opc) { #ifndef NDEBUG - if (OpcodeHitTable.count(Opc)) - OpcodeHitTable[Opc] += 1; - else - OpcodeHitTable[Opc] = 1; + ++OpcodeHitTable[Opc]; #endif // Else do nothing. } From 84d4037488f5b366e76be4fe723e0de7aeee264d Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Sat, 25 Jan 2025 10:12:19 +0100 Subject: [PATCH 084/432] Reapply "[libc++] Fix tests for clang::no_specializations for C++17 and C++20" The missing diagnostic pragmas have been added. 
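
Concretely, the reapplied patch wraps libc++'s own partial specialization in
__type_traits/result_of.h with a push/ignore/pop sequence, sketched here with
the template parameter list spelled out (the full hunk appears below):

    _LIBCPP_DIAGNOSTIC_PUSH
    #if __has_warning("-Winvalid-specialization")
    _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization")
    #endif
    template <class _Fp, class... _Args>
    struct _LIBCPP_TEMPLATE_VIS result_of<_Fp(_Args...)> : __invoke_result<_Fp, _Args...> {};
    _LIBCPP_DIAGNOSTIC_POP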
This reverts commit 8a6b44bf4cfe5df3db687a6b9519e99dbce8cf54. --- libcxx/include/__type_traits/result_of.h | 7 +++++- .../ranges/no_specializations.verify.cpp | 4 +++- .../type_traits/no_specializations.verify.cpp | 24 ++++++++++++------- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/libcxx/include/__type_traits/result_of.h b/libcxx/include/__type_traits/result_of.h index 217ca70b4cd20..e6adec7f9c978 100644 --- a/libcxx/include/__type_traits/result_of.h +++ b/libcxx/include/__type_traits/result_of.h @@ -22,10 +22,15 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS) template -struct _LIBCPP_DEPRECATED_IN_CXX17 result_of; +struct _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_NO_SPECIALIZATIONS result_of; +_LIBCPP_DIAGNOSTIC_PUSH +#if __has_warning("-Winvalid-specialization") +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization") +#endif template struct _LIBCPP_TEMPLATE_VIS result_of<_Fp(_Args...)> : __invoke_result<_Fp, _Args...> {}; +_LIBCPP_DIAGNOSTIC_POP # if _LIBCPP_STD_VER >= 14 template diff --git a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp index 69d458a920558..489e3a6a73744 100644 --- a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp @@ -13,7 +13,9 @@ #include -#if !__has_warning("-Winvalid-specialization") +#include "test_macros.h" + +#if !__has_warning("-Winvalid-specialization") || TEST_STD_VER <= 20 // expected-no-diagnostics #else struct S {}; diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp index e6d960667e8c0..807d01e381b49 100644 --- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -36,15 +36,22 @@ SPECIALIZE_TRAIT(make_unsigned); // expected-error {{cannot be specialize SPECIALIZE_TRAIT(remove_all_extents); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_const); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_cv); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_extent); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_pointer); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_reference); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_volatile); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(underlying_type); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}} + +# if TEST_STD_VER <= 17 +SPECIALIZE_TRAIT(result_of); // expected-error {{cannot be specialized}} +# endif + +# if TEST_STD_VER >= 20 +SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}} +# endif # undef SPECIALIZE_TRAIT # define SPECIALIZE_UTT(Trait) \ @@ -96,7 +103,6 @@ SPECIALIZE_UTT(is_move_assignable); // expected-error 2 {{cannot 
SPECIALIZE_UTT(is_move_constructible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_BTT(is_nothrow_assignable); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_constructible); // expected-error 2 {{cannot be specialized}}
-SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_copy_assignable); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_copy_constructible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_default_constructible); // expected-error 2 {{cannot be specialized}}
@@ -130,7 +136,6 @@ SPECIALIZE_UTT(is_trivially_default_constructible); // expected-error 2 {{cannot
 SPECIALIZE_UTT(is_trivially_destructible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_trivially_move_assignable); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_trivially_move_constructible); // expected-error 2 {{cannot be specialized}}
-SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_union); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_unsigned); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_void); // expected-error 2 {{cannot be specialized}}
@@ -140,11 +145,12 @@ SPECIALIZE_UTT(rank); // expected-error 2 {{cannot

 #  if TEST_STD_VER <= 17
 SPECIALIZE_UTT(is_literal_type); // expected-error 2 {{cannot be specialized}}
-SPECIALIZE_UTT(result_of); // expected-error 2 {{cannot be specialized}}
 #  endif

 #  if TEST_STD_VER >= 20
-SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}}
+SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}}
+SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}}
+SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}}
 #  endif

 #  if TEST_STD_VER >= 23
@@ -171,6 +177,8 @@ struct std::conditional; // expected-error {{cannot be specialized}}
 template <>
 struct std::enable_if; // expected-error {{cannot be specialized}}

+#if TEST_STD_VER >= 20
 template <>
 struct std::integral_constant; // expected-error {{cannot be specialized}}
 #endif
+#endif

From 7974f12b1e3682514bd58b35c5a784f35938fa04 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi
Date: Sat, 25 Jan 2025 18:22:14 +0900
Subject: [PATCH 085/432] [HLSL] Suppress a warning in #122820
 [-Wunused-but-set-variable]

--- clang/lib/Sema/SemaHLSL.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index a7033cb54886a..aa99b44958eaf 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -425,6 +425,7 @@ static CXXRecordDecl *createHostLayoutStruct(Sema &S,
   // copy base struct, create HLSL Buffer compatible version if needed
   if (unsigned NumBases = StructDecl->getNumBases()) {
     assert(NumBases == 1 && "HLSL supports only one base type");
+    (void)NumBases;
     CXXBaseSpecifier Base = *StructDecl->bases_begin();
     CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl();
     if (requiresImplicitBufferLayoutStructure(BaseDecl)) {

From 2696e4fb9567d23ce065a067e7f4909b310daf50 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser
Date: Sat, 25 Jan 2025 10:36:43 +0100
Subject: [PATCH 086/432] [libc++] Reduce std::conjunction overhead (#124259)

The old and new implementations of `_And` are very close in terms of
performance according to my testing, but the new implementation can also
be used to implement `conjunction`, which makes
that ~50% faster. --- libcxx/include/__type_traits/conjunction.h | 42 ++++++++++------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/libcxx/include/__type_traits/conjunction.h b/libcxx/include/__type_traits/conjunction.h index ad9656acd47ec..6b6717a50a468 100644 --- a/libcxx/include/__type_traits/conjunction.h +++ b/libcxx/include/__type_traits/conjunction.h @@ -10,8 +10,6 @@ #define _LIBCPP___TYPE_TRAITS_CONJUNCTION_H #include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_same.h> @@ -21,22 +19,29 @@ _LIBCPP_BEGIN_NAMESPACE_STD -template -using __expand_to_true _LIBCPP_NODEBUG = true_type; +template +struct _AndImpl; -template -__expand_to_true<__enable_if_t<_Pred::value>...> __and_helper(int); +template <> +struct _AndImpl { + template + using _Result _LIBCPP_NODEBUG = + typename _AndImpl::template _Result<_First, _Rest...>; +}; -template -false_type __and_helper(...); +template <> +struct _AndImpl { + template + using _Result _LIBCPP_NODEBUG = _Res; +}; // _And always performs lazy evaluation of its arguments. // // However, `_And<_Pred...>` itself will evaluate its result immediately (without having to // be instantiated) since it is an alias, unlike `conjunction<_Pred...>`, which is a struct. // If you want to defer the evaluation of `_And<_Pred...>` itself, use `_Lazy<_And, _Pred...>`. -template -using _And _LIBCPP_NODEBUG = decltype(std::__and_helper<_Pred...>(0)); +template +using _And _LIBCPP_NODEBUG = typename _AndImpl::template _Result; template struct __all_dummy; @@ -46,22 +51,11 @@ struct __all : _IsSame<__all_dummy<_Pred...>, __all_dummy<((void)_Pred, true)... #if _LIBCPP_STD_VER >= 17 -template -struct _LIBCPP_NO_SPECIALIZATIONS conjunction : true_type {}; - -_LIBCPP_DIAGNOSTIC_PUSH -# if __has_warning("-Winvalid-specialization") -_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization") -# endif -template -struct conjunction<_Arg> : _Arg {}; - -template -struct conjunction<_Arg, _Args...> : conditional_t> {}; -_LIBCPP_DIAGNOSTIC_POP +template +struct _LIBCPP_NO_SPECIALIZATIONS conjunction : _And<_Args...> {}; template -_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool conjunction_v = conjunction<_Args...>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool conjunction_v = _And<_Args...>::value; #endif // _LIBCPP_STD_VER >= 17 From 52bffdf9f5bb72eb86249a012d08a40c90316dfb Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 25 Jan 2025 10:59:50 +0000 Subject: [PATCH 087/432] [IPSCCP][FuncSpec] Protect against metadata access from call args. (#124284) Fixes an issue reported from #114964, where metadata arguments were attempted to be accessed as constants. 
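
As a sketch of the fix (mirroring the FunctionSpecialization.cpp hunk below;
the explanatory comments are added here, not part of the patch):

    for (unsigned Idx = 0, E = I.getNumOperands() - 1; Idx != E; ++Idx) {
      Value *V = I.getOperand(Idx);
      // Metadata operands, such as the !"fpexcept.strict" strings on
      // constrained FP intrinsics, have no Constant representation, so
      // bail out instead of asking the solver for a constant.
      if (isa<MetadataAsValue>(V))
        return nullptr;
      Constant *C = findConstantFor(V);
      if (!C)
        return nullptr;
      // ... cost estimation continues for constant operands ...
    }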
--- .../Transforms/IPO/FunctionSpecialization.cpp | 2 ++
 .../solver-constant-strictfpmetadata.ll | 17 +++++++++++++++++
 2 files changed, 19 insertions(+)
 create mode 100644 llvm/test/Transforms/FunctionSpecialization/solver-constant-strictfpmetadata.ll

diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 449d64d1614ff..c13305ce5056d 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -415,6 +415,8 @@ Constant *InstCostVisitor::visitCallBase(CallBase &I) {

   for (unsigned Idx = 0, E = I.getNumOperands() - 1; Idx != E; ++Idx) {
     Value *V = I.getOperand(Idx);
+    if (isa<MetadataAsValue>(V))
+      return nullptr;
     Constant *C = findConstantFor(V);
     if (!C)
       return nullptr;
diff --git a/llvm/test/Transforms/FunctionSpecialization/solver-constant-strictfpmetadata.ll b/llvm/test/Transforms/FunctionSpecialization/solver-constant-strictfpmetadata.ll
new file mode 100644
index 0000000000000..99224b4efba6b
--- /dev/null
+++ b/llvm/test/Transforms/FunctionSpecialization/solver-constant-strictfpmetadata.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=ipsccp -force-specialization -S < %s | FileCheck %s
+
+define float @test(ptr %this, float %cm, i1 %0) strictfp {
+; CHECK-LABEL: define float @test(
+; CHECK-SAME: ptr [[THIS:%.*]], float [[CM:%.*]], i1 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[CM]], float 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict")
+; CHECK-NEXT: [[CALL295:%.*]] = call float @test.specialized.1(ptr null, float 0.000000e+00, i1 false)
+; CHECK-NEXT: ret float 0.000000e+00
+;
+entry:
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f32(float %cm, float 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict") #0
+  %call295 = call float @test(ptr null, float 0.000000e+00, i1 false) #0
+  ret float 0.000000e+00
+}
+

From 1a53d4baeb0242e00c494fd0a2b2ce58bcbf28b6 Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Sat, 25 Jan 2025 03:59:45 -0800
Subject: [PATCH 088/432] [clang][cmake] Apply bolt optimizations as part of
 the clang target (#119896)

This change removes the need to call the clang-bolt target in order to
apply bolt optimizations to clang. Now running `ninja clang` will build
a clang with bolt optimizations, and `ninja check-clang` and `ninja
install-clang` will test and install bolt optimized clang too.

The clang-bolt target has been kept for compatibility reasons, but it is
now just an alias to the clang target.

Also, this new design for applying the bolt optimizations to clang will
be easier to generalize and use to optimize other binaries/libraries in
the project.

---------

Co-authored-by: Amir Ayupov
Co-authored-by: Petr Hosek
--- clang/CMakeLists.txt | 52 ------------
 clang/tools/driver/CMakeLists.txt | 52 +++++++++++++
 clang/utils/perf-training/CMakeLists.txt | 26 +------
 clang/utils/perf-training/perf-helper.py | 98 ++++++++++++++++++++++++
 4 files changed, 151 insertions(+), 77 deletions(-)

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index b79e570667b2c..cacbf2ebf868f 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -872,58 +872,6 @@ if (CLANG_ENABLE_BOOTSTRAP)
   endforeach()
 endif()

-set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang.
\ - May be specified as Instrument or Perf or LBR to use a particular profiling \ - mechanism.") -string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) - -if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) - set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) - set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED}) - set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata) - - # Pass extra flag in no-LBR mode - if (CLANG_BOLT STREQUAL "PERF") - set(BOLT_NO_LBR "-nl") - endif() - - if (CLANG_BOLT STREQUAL "INSTRUMENT") - # Instrument clang with BOLT - add_custom_target(clang-instrumented - DEPENDS ${CLANG_INSTRUMENTED} - ) - add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} - DEPENDS clang llvm-bolt - COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} - -instrument --instrumentation-file-append-pid - --instrumentation-file=${BOLT_FDATA} - COMMENT "Instrumenting clang binary with BOLT" - USES_TERMINAL - VERBATIM - ) - add_custom_target(clang-bolt-training-deps DEPENDS clang-instrumented) - else() # perf or LBR - add_custom_target(clang-bolt-training-deps DEPENDS clang) - endif() - - # Optimize original (pre-bolt) Clang using the collected profile - add_custom_target(clang-bolt - DEPENDS clang-bolt-profile - COMMAND ${CMAKE_COMMAND} -E rename $ ${CLANG_PATH}-prebolt - COMMAND ${CMAKE_COMMAND} -E create_symlink ${CLANG_PATH}-prebolt ${CLANG_PATH}++-prebolt - COMMAND llvm-bolt ${CLANG_PATH}-prebolt - -o $ - -data ${BOLT_FDATA} - -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions - -split-all-cold -split-eh -dyno-stats -use-gnu-stack - -update-debug-sections - ${BOLT_NO_LBR} - COMMENT "Optimizing Clang with BOLT" - USES_TERMINAL - VERBATIM - ) -endif() - if (LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION) add_subdirectory(utils/ClangVisualizers) endif() diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt index a4debc2dd2e89..ad336fcc45b60 100644 --- a/clang/tools/driver/CMakeLists.txt +++ b/clang/tools/driver/CMakeLists.txt @@ -23,6 +23,18 @@ if(CLANG_PLUGIN_SUPPORT) set(support_plugins SUPPORT_PLUGINS) endif() +set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. \ + May be specified as Instrument or Perf or LBR to use a particular profiling \ + mechanism.") +string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) + +if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) + set(CLANG_BOLT_DEPS clear-bolt-fdata llvm-bolt llvm-readobj) + if (NOT CLANG_BOLT STREQUAL "INSTRUMENT") + list(APPEND CLANG_BOLT_DEPS clear-perf-data) + endif() +endif() + add_clang_tool(clang driver.cpp cc1_main.cpp @@ -35,6 +47,7 @@ add_clang_tool(clang ARMTargetParserTableGen AArch64TargetParserTableGen ${support_plugins} + ${CLANG_BOLT_DEPS} GENERATE_DRIVER ) @@ -134,3 +147,42 @@ if(CLANG_ORDER_FILE AND set_target_properties(clang PROPERTIES LINK_DEPENDS ${CLANG_ORDER_FILE}) endif() endif() + +if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) + # Add a clang-bolt target for backwards compatibility. 
+ add_custom_target(clang-bolt DEPENDS clang) + + set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING + "Name of BOLT-instrumented Clang binary") + set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED}) + set(PERF_TRAINING_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../utils/perf-training) + set(BOLT_FDATA ${PERF_TRAINING_BINARY_DIR}/prof.fdata) + get_llvm_lit_path( + lit_base_dir + lit_file_name + ALLOW_EXTERNAL + ) + set(LIT_COMMAND "${lit_base_dir}/${lit_file_name}") + + # This POST_BUILD command is executed unconditionally even if the clang target + # is already built. We need to wrap the whole bolt optimization process in + # a single python wrapper, so that we can first check if the binary has + # already been optimized and then exit early with a 0 status if it has. + add_custom_command( + TARGET clang POST_BUILD + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/perf-training/perf-helper.py + bolt-optimize + --method ${CLANG_BOLT} + --input $ + --instrumented-output ${CLANG_INSTRUMENTED} + --fdata ${BOLT_FDATA} + --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR} + --readelf $ + --bolt $ + --lit "${LIT_COMMAND}" + --merge-fdata $ + COMMENT "Optimizing Clang with BOLT" + USES_TERMINAL + VERBATIM + ) +endif() diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index 49673790ff6e8..4aed086563ee9 100644 --- a/clang/utils/perf-training/CMakeLists.txt +++ b/clang/utils/perf-training/CMakeLists.txt @@ -83,8 +83,6 @@ if(APPLE AND DTRACE AND NOT LLVM_TOOL_LLVM_DRIVER_BUILD) endif() if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) - set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING - "Name of BOLT-instrumented Clang binary") configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/bolt.lit.site.cfg.in ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/lit.site.cfg @@ -93,7 +91,7 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) add_lit_testsuite(generate-bolt-fdata "Generating BOLT profile for Clang" ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/ EXCLUDE_FROM_CHECK_ALL - DEPENDS clang-bolt-training-deps clear-bolt-fdata clear-perf-data + DEPENDS clear-bolt-fdata clear-perf-data ) add_custom_target(clear-bolt-fdata @@ -104,26 +102,4 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data COMMENT "Clearing old perf data") - string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) - if (CLANG_BOLT STREQUAL "LBR") - set(BOLT_LBR "--lbr") - endif() - - add_custom_target(merge-fdata-deps) - if (CLANG_BOLT STREQUAL "INSTRUMENT") - add_dependencies(merge-fdata-deps generate-bolt-fdata) - else() - # Convert perf profiles into fdata - add_custom_target(convert-perf-fdata - COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py perf2bolt $ ${CMAKE_CURRENT_BINARY_DIR} $ ${BOLT_LBR} - COMMENT "Converting perf files to BOLT fdata" - DEPENDS llvm-bolt generate-bolt-fdata) - add_dependencies(merge-fdata-deps convert-perf-fdata) - endif() - - # Merge profiles into one using merge-fdata - add_custom_target(clang-bolt-profile - COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge-fdata $ ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata ${CMAKE_CURRENT_BINARY_DIR} - COMMENT "Merging BOLT fdata" - DEPENDS merge-fdata merge-fdata-deps) endif() diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py index d76c6ede3fe5a..55c5160a71c4f 100644 --- 
a/clang/utils/perf-training/perf-helper.py +++ b/clang/utils/perf-training/perf-helper.py @@ -16,6 +16,8 @@ import bisect import shlex import tempfile +import re +import shutil test_env = {"PATH": os.environ["PATH"]} @@ -558,7 +560,103 @@ def genOrderFile(args): return 0 +def bolt_optimize(args): + parser = argparse.ArgumentParser("%prog [options] ") + parser.add_argument("--method", choices=["INSTRUMENT", "PERF", "LBR"]) + parser.add_argument("--input") + parser.add_argument("--instrumented-output") + parser.add_argument("--fdata") + parser.add_argument("--perf-training-binary-dir") + parser.add_argument("--readelf") + parser.add_argument("--bolt") + parser.add_argument("--lit") + parser.add_argument("--merge-fdata") + + opts = parser.parse_args(args) + + output = subprocess.check_output( + [opts.readelf, "-WS", opts.input], universal_newlines=True + ) + + # This binary has already been bolt-optimized, so skip further processing. + if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE): + return 0 + + if opts.method == "INSTRUMENT": + process = subprocess.run( + [ + opts.bolt, + opts.input, + "-o", + opts.instrumented_output, + "-instrument", + "--instrumentation-file-append-pid", + f"--instrumentation-file={opts.fdata}", + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + + print(process.args) + for line in process.stdout: + sys.stdout.write(line) + process.check_returncode() + + process = subprocess.run( + [ + sys.executable, + opts.lit, + os.path.join(opts.perf_training_binary_dir, "bolt-fdata"), + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + + print(process.args) + for line in process.stdout: + sys.stdout.write(line) + process.check_returncode() + + if opts.method in ["PERF", "LBR"]: + perf2bolt([opts.bolt, opts.perf_training_binary_dir, opts.input]) + + merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir]) + + shutil.copy(opts.input, f"{opts.input}-prebolt") + + process = subprocess.run( + [ + opts.bolt, + f"{opts.input}-prebolt", + "-o", + opts.input, + "-data", + opts.fdata, + "-reorder-blocks=ext-tsp", + "-reorder-functions=cdsort", + "-split-functions", + "-split-all-cold", + "-split-eh", + "-dyno-stats", + "-use-gnu-stack", + "-update-debug-sections", + "-nl" if opts.method == "PERF" else "", + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + + print(process.args) + for line in process.stdout: + sys.stdout.write(line) + process.check_returncode() + + commands = { + "bolt-optimize": bolt_optimize, "clean": clean, "merge": merge, "dtrace": dtrace, From de5ff8ad07ae824b86c5cefcba63f4b66607b759 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sat, 25 Jan 2025 13:08:00 +0100 Subject: [PATCH 089/432] [libc++][test] Improves C++ Standard filtering. (#89499) Adds a new lit directive to improve C++ Standard filtering. This is based on the [Discourse](https://discourse.llvm.org/t/rfc-improving-c-standard-filtering-in-the-lit-tests/78474) discussion. 
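
For example (mirroring the documentation added in this patch), a test that
needs C++26 or later can now state that directly:

    // UNSUPPORTED: std-at-least-c++26

instead of enumerating every released standard:

    // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23

and a test that is only valid for a bounded range of versions, such as a
deprecation test, spells the range out explicitly:

    // REQUIRES: c++17 || c++20 || c++23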
--- libcxx/docs/TestingLibcxx.rst | 32 +++++++++++++++++++ .../print.fun/includes.compile.pass.cpp | 2 +- .../print.fun/no_file_description.pass.cpp | 2 +- .../locale.stdcvt/depr.verify.cpp | 2 +- .../conversions.buffer/depr.verify.cpp | 2 +- .../conversions.string/depr.verify.cpp | 2 +- .../reserve.deprecated_in_cxx20.verify.cpp | 2 +- libcxx/utils/libcxx/test/params.py | 3 +- 8 files changed, 40 insertions(+), 7 deletions(-) diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index e98b96bfb478f..4da7f3e85d291 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -435,6 +435,38 @@ writing tests easier. See `libc++-specific Lit Directives`_ for more information extension.) +C++ Standard version tests +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Historically libc++ tests used to filter the tests for C++ Standard versions +with lit directives like: + +.. code-block:: cpp + + // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +With C++ Standards released every 3 years, this solution is not scalable. +Instead use: + +.. code-block:: cpp + + // UNSUPPORTED: std-at-least-c++26 + +There is no corresponding ``std-at-most-c++23``. This could be useful when +tests are only valid for a small set of standard versions. For example, a +deprecation test is only valid when the feature is deprecated until it is +removed from the Standard. These tests should be written like: + +.. code-block:: cpp + + // REQUIRES: c++17 || c++20 || c++23 + +.. note:: + + There are a lot of tests with the first style, these can remain as they are. + The new style is only intended to be used for new tests. + + Benchmarks ========== diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp index 9b9b0e404e6b7..38e4e4d3fb9ef 100644 --- a/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp +++ b/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// REQUIRES: std-at-least-c++23 // UNSUPPORTED: no-filesystem // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp index d3e4463fe0bc8..5561a1a8b3334 100644 --- a/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp +++ b/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// REQUIRES: std-at-least-c++23 // UNSUPPORTED: no-filesystem // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME diff --git a/libcxx/test/std/localization/locale.stdcvt/depr.verify.cpp b/libcxx/test/std/localization/locale.stdcvt/depr.verify.cpp index b3c6fc8674f8a..7bdcaa5190bd0 100644 --- a/libcxx/test/std/localization/locale.stdcvt/depr.verify.cpp +++ b/libcxx/test/std/localization/locale.stdcvt/depr.verify.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14, c++26 +// REQUIRES: c++17 || c++20 || c++23 // UNSUPPORTED: no-wide-characters // diff --git 
a/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/depr.verify.cpp b/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/depr.verify.cpp
index cb067e99a4764..dcab5cef3a550 100644
--- a/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/depr.verify.cpp
+++ b/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/depr.verify.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//

-// UNSUPPORTED: c++03, c++11, c++14, c++26
+// REQUIRES: c++17 || c++20 || c++23

 // XFAIL: no-wide-characters

diff --git a/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.string/depr.verify.cpp b/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.string/depr.verify.cpp
index f8bd156bdd5f6..6eab4a5dd9223 100644
--- a/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.string/depr.verify.cpp
+++ b/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.string/depr.verify.cpp
@@ -8,7 +8,7 @@

 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX26_REMOVED_WSTRING_CONVERT

-// UNSUPPORTED: c++03, c++11, c++14, c++26
+// REQUIRES: c++17 || c++20 || c++23

 // UNSUPPORTED: no-wide-characters

 //
diff --git a/libcxx/test/std/strings/basic.string/string.capacity/reserve.deprecated_in_cxx20.verify.cpp b/libcxx/test/std/strings/basic.string/string.capacity/reserve.deprecated_in_cxx20.verify.cpp
index 81edd9b83d184..87b56c06b9512 100644
--- a/libcxx/test/std/strings/basic.string/string.capacity/reserve.deprecated_in_cxx20.verify.cpp
+++ b/libcxx/test/std/strings/basic.string/string.capacity/reserve.deprecated_in_cxx20.verify.cpp
@@ -10,7 +10,7 @@

 // void reserve(); // Deprecated in C++20

-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++26
+// REQUIRES: c++20 || c++23

 #include

diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 947cfd2651364..8fd3872cd8cbb 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -186,7 +186,8 @@ def getSuitableClangTidy(cfg):
                 AddFeature(std),
                 AddSubstitution("%{cxx_std}", re.sub(r"\+", "x", std)),
                 AddCompileFlag(lambda cfg: getStdFlag(cfg, std)),
-            ],
+            ]
+            + [AddFeature(f"std-at-least-{s}") for s in _allStandards if s <= std],
         ),
         Parameter(
             name="optimization",

From 6383a12e3b4339fa4743bb97da4d51dea6d2e2ea Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sat, 25 Jan 2025 13:32:00 +0000
Subject: [PATCH 090/432] [VPlan] Refactor HCFG builder to preserve original
 vector latch (NFC).

Update HCFG builder to preserve the original latch block of the initial
VPlan, ensuring there is always a latch. It also skips creating the
BranchOnCond for the latch of the top-level loop, instead of removing it
later. Exiting via the latch is controlled by later recipes. This
further unifies HCFG construction and prepares it for also building an
initial VPlan (VPlan0) for inner loops.
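
Schematically (condensed from the updated vplan-printing-outer-loop.ll test
below; block names are as printed by the VPlan printer), the top-level region
now always ends in a dedicated latch block instead of branching out of the
body directly:

    outer.loop: {
      vector.body:
        ...                        ; widened recipes for the loop body
      ...
      outer.latch:
        EMIT ir<%outer.iv.next> = add ir<%outer.iv>, ir<1>
        EMIT ir<%outer.ec> = icmp ir<%outer.iv.next>, ir<8>
      Successor(s): vector.latch   ; no branch-on-cond created here
      vector.latch:                ; the preserved original latch
      No successors
    }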
--- .../Transforms/Vectorize/LoopVectorize.cpp | 6 ---
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 3 ++
 .../Transforms/Vectorize/VPlanHCFGBuilder.cpp | 17 +++++--
 .../vplan-printing-outer-loop.ll | 8 +--
 .../Transforms/Vectorize/VPlanHCFGTest.cpp | 49 +++++++++++--------
 5 files changed, 49 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 49694eb68e25b..3a4f637f177e1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9509,12 +9509,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
       *PSE.getSE(), *TLI);

-  // Remove the existing terminator of the exiting block of the top-most region.
-  // A BranchOnCount will be added instead when adding the canonical IV recipes.
-  auto *Term =
-      Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
-  Term->eraseFromParent();
-
   // Tail folding is not supported for outer loops, so the induction increment
   // is guaranteed to not wrap.
   bool HasNUW = true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4159a71469bd1..83c54a9b9c259 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1628,6 +1628,9 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
     VFRange SubRange = {VF, MaxVFTimes2};
     auto Plan = buildVPlan(SubRange);
     VPlanTransforms::optimize(*Plan);
+    // Update the name of the latch of the top-level vector loop region
+    // after optimizations, which include block folding.
+    Plan->getVectorLoopRegion()->getExiting()->setName("vector.latch");
     VPlans.push_back(std::move(Plan));
     VF = SubRange.End;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 0f3aa8d08e7b8..32723e5db9c45 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -292,6 +292,11 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
            "Instruction shouldn't have been visited.");

     if (auto *Br = dyn_cast<BranchInst>(Inst)) {
+      if (TheLoop->getLoopLatch() == BB ||
+          any_of(successors(BB),
+                 [this](BasicBlock *Succ) { return !TheLoop->contains(Succ); }))
+        continue;
+
       // Conditional branch instruction are represented using BranchOnCond
       // recipes.
       if (Br->isConditional()) {
@@ -356,11 +361,6 @@ void PlainCFGBuilder::buildPlainCFG() {
   VPBasicBlock *VectorLatchVPBB = TheRegion->getExitingBasicBlock();
   BB2VPBB[TheLoop->getHeader()] = VectorHeaderVPBB;
   VectorHeaderVPBB->clearSuccessors();
-  VectorLatchVPBB->clearPredecessors();
-  if (TheLoop->getHeader() != TheLoop->getLoopLatch())
-    BB2VPBB[TheLoop->getLoopLatch()] = VectorLatchVPBB;
-  else
-    TheRegion->setExiting(VectorHeaderVPBB);

   // 1. Scan the body of the loop in a topological order to visit each basic
   // block after having visited its predecessor basic blocks. Create a VPBB for
@@ -398,6 +398,13 @@ void PlainCFGBuilder::buildPlainCFG() {
       setRegionPredsFromBB(Region, BB);
     }

+    if (TheLoop->getLoopLatch() == BB) {
+      VPBB->setOneSuccessor(VectorLatchVPBB);
+      VectorLatchVPBB->clearPredecessors();
+      VectorLatchVPBB->setPredecessors({VPBB});
+      continue;
+    }
+
     // Set VPBB successors. We create empty VPBBs for successors if they don't
     // exist already.
Recipes will be created when the successor is visited // during the RPO traversal. diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll index 2adeb5920cb5b..52b2bcd9aac11 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll @@ -35,12 +35,14 @@ define void @foo(i64 %n) { ; CHECK-NEXT: EMIT branch-on-cond ir<%inner.ec> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): vector.latch +; CHECK-NEXT: Successor(s): outer.latch ; CHECK-EMPTY: -; CHECK-NEXT: vector.latch: +; CHECK-NEXT: outer.latch: ; CHECK-NEXT: EMIT ir<%outer.iv.next> = add ir<%outer.iv>, ir<1> ; CHECK-NEXT: EMIT ir<%outer.ec> = icmp ir<%outer.iv.next>, ir<8> -; CHECK-NEXT: EMIT branch-on-cond ir<%outer.ec> +; CHECK-NEXT: Successor(s): vector.latch +; CHECK-EMPTY: +; CHECK-NEXT: vector.latch: ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 19c2483d34ed1..dcdaf008e10fe 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -48,16 +48,19 @@ TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) { EXPECT_EQ(0u, Entry->getNumPredecessors()); EXPECT_EQ(1u, Entry->getNumSuccessors()); - // Check that the region following the preheader is a single basic-block - // region (loop). + // Check that the region following the preheader consists of a block for the + // original header and a separate latch. VPBasicBlock *VecBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); - EXPECT_EQ(8u, VecBB->size()); + EXPECT_EQ(7u, VecBB->size()); EXPECT_EQ(0u, VecBB->getNumPredecessors()); - EXPECT_EQ(0u, VecBB->getNumSuccessors()); + EXPECT_EQ(1u, VecBB->getNumSuccessors()); EXPECT_EQ(VecBB->getParent()->getEntryBasicBlock(), VecBB); - EXPECT_EQ(VecBB->getParent()->getExitingBasicBlock(), VecBB); EXPECT_EQ(&*Plan, VecBB->getPlan()); + VPBlockBase *VecLatch = VecBB->getSingleSuccessor(); + EXPECT_EQ(VecLatch->getParent()->getExitingBasicBlock(), VecLatch); + EXPECT_EQ(0u, VecLatch->getNumSuccessors()); + auto Iter = VecBB->begin(); VPWidenPHIRecipe *Phi = dyn_cast(&*Iter++); EXPECT_NE(nullptr, Phi); @@ -127,29 +130,33 @@ compound=true " EMIT store ir\<%res\>, ir\<%arr.idx\>\l" + " EMIT ir\<%indvars.iv.next\> = add ir\<%indvars.iv\>, ir\<1\>\l" + " EMIT ir\<%exitcond\> = icmp ir\<%indvars.iv.next\>, ir\<%N\>\l" + - " EMIT branch-on-cond ir\<%exitcond\>\l" + + "Successor(s): vector.latch\l" + ] + N2 -> N4 [ label=""] + N4 [label = + "vector.latch:\l" + "No successors\l" ] } - N2 -> N4 [ label="" ltail=cluster_N3] - N4 [label = + N4 -> N5 [ label="" ltail=cluster_N3] + N5 [label = "middle.block:\l" + " EMIT vp\<%cmp.n\> = icmp eq ir\<%N\>, vp\<%0\>\l" + " EMIT branch-on-cond vp\<%cmp.n\>\l" + "Successor(s): ir-bb\, scalar.ph\l" ] - N4 -> N5 [ label="T"] - N4 -> N6 [ label="F"] - N5 [label = + N5 -> N6 [ label="T"] + N5 -> N7 [ label="F"] + N6 [label = "ir-bb\:\l" + "No successors\l" ] - N6 [label = + N7 [label = "scalar.ph:\l" + "Successor(s): ir-bb\\l" ] - N6 -> N7 [ label=""] - N7 [label = + N7 -> N8 [ label=""] + N8 [label = "ir-bb\:\l" + " IR %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\l" + " IR %arr.idx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv\l" + @@ -204,14 +211,17 @@ 
TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) {
   EXPECT_EQ(0u, Entry->getNumPredecessors());
   EXPECT_EQ(1u, Entry->getNumSuccessors());

-  // Check that the region following the preheader is a single basic-block
-  // region (loop).
+  // Check that the region following the preheader consists of a block for the
+  // original header and a separate latch.
   VPBasicBlock *VecBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
-  EXPECT_EQ(8u, VecBB->size());
+  EXPECT_EQ(7u, VecBB->size());
   EXPECT_EQ(0u, VecBB->getNumPredecessors());
-  EXPECT_EQ(0u, VecBB->getNumSuccessors());
+  EXPECT_EQ(1u, VecBB->getNumSuccessors());
   EXPECT_EQ(VecBB->getParent()->getEntryBasicBlock(), VecBB);
-  EXPECT_EQ(VecBB->getParent()->getExitingBasicBlock(), VecBB);
+
+  VPBlockBase *VecLatch = VecBB->getSingleSuccessor();
+  EXPECT_EQ(VecLatch->getParent()->getExitingBasicBlock(), VecLatch);
+  EXPECT_EQ(0u, VecLatch->getNumSuccessors());

   auto Iter = VecBB->begin();
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
@@ -221,7 +231,6 @@ TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) {
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
-  EXPECT_NE(nullptr, dyn_cast(&*Iter++));
   EXPECT_EQ(VecBB->end(), Iter);
 }

From 9325a61aa0960595c22867799ebd157c8160fd86 Mon Sep 17 00:00:00 2001
From: James Y Knight
Date: Sat, 25 Jan 2025 10:16:37 -0500
Subject: [PATCH 091/432] Revert "[GlobalMerge][NFC] Skip sorting by
 profitability when it is not needed" (#124411)

Reverts llvm/llvm-project#124146 -- the new comparator is not a strict
weak ordering, as required by stable_sort.

Co-authored-by: Michael Maitland
--- llvm/lib/CodeGen/GlobalMerge.cpp | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp
index 41e01a1d3ccd5..7b76155b175d1 100644
--- a/llvm/lib/CodeGen/GlobalMerge.cpp
+++ b/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -423,12 +423,24 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl<GlobalVariable *> &Globals,
     }
   }

+  // Now we found a bunch of sets of globals used together. We accumulated
+  // the number of times we encountered the sets (i.e., the number of functions
+  // that use that exact set of globals).
+  //
+  // Multiply that by the size of the set to give us a crude profitability
+  // metric.
+  llvm::stable_sort(UsedGlobalSets,
+                    [](const UsedGlobalSet &UGS1, const UsedGlobalSet &UGS2) {
+                      return UGS1.Globals.count() * UGS1.UsageCount <
+                             UGS2.Globals.count() * UGS2.UsageCount;
+                    });
+
   // We can choose to merge all globals together, but ignore globals never used
   // with another global. This catches the obviously non-profitable cases of
   // having a single global, but is aggressive enough for any other case.
   if (GlobalMergeIgnoreSingleUse) {
     BitVector AllGlobals(Globals.size());
-    for (const UsedGlobalSet &UGS : UsedGlobalSets) {
+    for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) {
       if (UGS.UsageCount == 0)
         continue;
       if (UGS.Globals.count() > 1)
@@ -437,16 +449,6 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl<GlobalVariable *> &Globals,
     return doMerge(Globals, AllGlobals, M, isConst, AddrSpace);
   }

-  // Now we found a bunch of sets of globals used together. We accumulated
-  // the number of times we encountered the sets (i.e., the number of functions
-  // that use that exact set of globals). Multiply that by the size of the set
-  // to give us a crude profitability metric.
- llvm::stable_sort(UsedGlobalSets,
- [](const UsedGlobalSet &UGS1, const UsedGlobalSet &UGS2) {
- return UGS1.Globals.count() * UGS1.UsageCount >=
- UGS2.Globals.count() * UGS2.UsageCount;
- });
-
 // Starting from the sets with the best (=biggest) profitability, find a
 // good combination.
 // The ideal (and expensive) solution can only be found by trying all
@@ -456,7 +458,7 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl<GlobalVariable *> &Globals,
 BitVector PickedGlobals(Globals.size());
 bool Changed = false;
- for (const UsedGlobalSet &UGS : UsedGlobalSets) {
+ for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) {
 if (UGS.UsageCount == 0)
 continue;
 if (PickedGlobals.anyCommon(UGS.Globals))

From 21f04b1458c52ba875a23b58b02cf6b1f8db0661 Mon Sep 17 00:00:00 2001
From: Adam Paszke
Date: Sat, 25 Jan 2025 16:28:21 +0100
Subject: [PATCH 092/432] Hold a queue of iterator ranges (not operations) in
 wouldOpBeTriviallyDead (#123642)

Ranges let us push whole blocks onto the queue in constant time. If one
of the first ops in the block is side-effecting we'll be able to provide
the answer quickly. The previous implementation had to walk the block and
queue all the operations only to start traversing them again, which was a
considerable slowdown for compile times of large MLIR programs in our
benchmarks.

---------

Co-authored-by: Jacques Pienaar
---
 mlir/lib/Interfaces/SideEffectInterfaces.cpp | 23 +++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Interfaces/SideEffectInterfaces.cpp b/mlir/lib/Interfaces/SideEffectInterfaces.cpp
index c9feb001a1984..59fd19310cea5 100644
--- a/mlir/lib/Interfaces/SideEffectInterfaces.cpp
+++ b/mlir/lib/Interfaces/SideEffectInterfaces.cpp
@@ -10,6 +10,7 @@
 #include "mlir/IR/SymbolTable.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include <utility>

 using namespace mlir;

@@ -41,10 +42,18 @@ bool mlir::isOpTriviallyDead(Operation *op) {
 /// allows for marking region operations as trivially dead without always being
 /// conservative of terminators.
 static bool wouldOpBeTriviallyDeadImpl(Operation *rootOp) {
- // The set of operations to consider when checking for side effects.
- SmallVector<Operation *, 1> effectingOps(1, rootOp);
+ // The set of operation intervals (end-exclusive) to consider when checking
+ // for side effects.
+ SmallVector<std::pair<Block::iterator, Block::iterator>, 1> effectingOps = {
+ std::make_pair(Block::iterator(rootOp), ++Block::iterator(rootOp))};
 while (!effectingOps.empty()) {
- Operation *op = effectingOps.pop_back_val();
+ Block::iterator &it = effectingOps.back().first;
+ Block::iterator end = effectingOps.back().second;
+ if (it == end) {
+ effectingOps.pop_back();
+ continue;
+ }
+ mlir::Operation *op = &*(it++);

 // If the operation has recursive effects, push all of the nested operations
 // on to the stack to consider.
@@ -53,8 +62,7 @@ static bool wouldOpBeTriviallyDeadImpl(Operation *rootOp) {
 if (hasRecursiveEffects) {
 for (Region &region : op->getRegions()) {
 for (auto &block : region) {
- for (auto &nestedOp : block)
- effectingOps.push_back(&nestedOp);
+ effectingOps.push_back(std::make_pair(block.begin(), block.end()));
 }
 }
 }
@@ -86,10 +94,9 @@ static bool wouldOpBeTriviallyDeadImpl(Operation *rootOp) {
 return false;
 }
 continue;
-
- // Otherwise, if the op has recursive side effects we can treat the
- // operation itself as having no effects.
 }
+ // Otherwise, if the op only has recursive side effects we can treat the
+ // operation itself as having no effects. We will visit its children next.
if (hasRecursiveEffects) continue; From 5cb2db3b51c2a9d516d57bd2f07d9899bd5fdae7 Mon Sep 17 00:00:00 2001 From: vporpo Date: Sat, 25 Jan 2025 08:19:27 -0800 Subject: [PATCH 093/432] [SandboxVec][Scheduler] Forbid crossing BBs (#124369) This patch updates the scheduler to forbid scheduling across BBs. It should eventually be able to handle this, but we disable it for now. --- .../Vectorize/SandboxVectorizer/Scheduler.h | 6 ++- .../Vectorize/SandboxVectorizer/Scheduler.cpp | 8 ++- .../SandboxVectorizer/SchedulerTest.cpp | 52 +++++++++++++++++++ 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h index 25432e1396c73..0da1894c90613 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h @@ -122,6 +122,8 @@ class Scheduler { std::optional ScheduleTopItOpt; // TODO: This is wasting memory in exchange for fast removal using a raw ptr. DenseMap> Bndls; + /// The BB that we are currently scheduling. + BasicBlock *ScheduledBB = nullptr; /// \Returns a scheduling bundle containing \p Instrs. SchedBundle *createBundle(ArrayRef Instrs); @@ -166,8 +168,10 @@ class Scheduler { DAG.clear(); ReadyList.clear(); ScheduleTopItOpt = std::nullopt; + ScheduledBB = nullptr; assert(Bndls.empty() && DAG.empty() && ReadyList.empty() && - !ScheduleTopItOpt && "Expected empty state!"); + !ScheduleTopItOpt && ScheduledBB == nullptr && + "Expected empty state!"); } #ifndef NDEBUG diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp index 496521b95a98e..06c1ef6b6d5ae 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp @@ -189,7 +189,13 @@ bool Scheduler::trySchedule(ArrayRef Instrs) { [Instrs](Instruction *I) { return I->getParent() == (*Instrs.begin())->getParent(); }) && - "Instrs not in the same BB!"); + "Instrs not in the same BB, should have been rejected by Legality!"); + if (ScheduledBB == nullptr) + ScheduledBB = Instrs[0]->getParent(); + // We don't support crossing BBs for now. 
+ if (any_of(Instrs,
+ [this](Instruction *I) { return I->getParent() != ScheduledBB; }))
+ return false;
 auto SchedState = getBndlSchedState(Instrs);
 switch (SchedState) {
 case BndlSchedState::FullyScheduled:
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp
index c5e44a97976a7..5a2b92ed24b03 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp
@@ -51,6 +51,14 @@ struct SchedulerTest : public testing::Test {
 }
 };

+static sandboxir::BasicBlock *getBasicBlockByName(sandboxir::Function *F,
+ StringRef Name) {
+ for (sandboxir::BasicBlock &BB : *F)
+ if (BB.getName() == Name)
+ return &BB;
+ llvm_unreachable("Expected to find basic block!");
+}
+
 TEST_F(SchedulerTest, SchedBundle) {
 parseIR(C, R"IR(
 define void @foo(ptr %ptr, i8 %v0, i8 %v1) {
@@ -237,3 +245,47 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1) {
 EXPECT_TRUE(Sched.trySchedule({Add0, Add1}));
 EXPECT_TRUE(Sched.trySchedule({L0, L1}));
 }
+
+TEST_F(SchedulerTest, DontCrossBBs) {
+ parseIR(C, R"IR(
+define void @foo(ptr noalias %ptr0, ptr noalias %ptr1, i8 %v0, i8 %v1) {
+bb0:
+ %add0 = add i8 %v0, 0
+ %add1 = add i8 %v1, 1
+ br label %bb1
+bb1:
+ store i8 %add0, ptr %ptr0
+ store i8 %add1, ptr %ptr1
+ ret void
+}
+)IR");
+ llvm::Function *LLVMF = &*M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ auto *F = Ctx.createFunction(LLVMF);
+ auto *BB0 = getBasicBlockByName(F, "bb0");
+ auto *BB1 = getBasicBlockByName(F, "bb1");
+ auto It = BB0->begin();
+ auto *Add0 = &*It++;
+ auto *Add1 = &*It++;
+
+ It = BB1->begin();
+ auto *S0 = cast<sandboxir::StoreInst>(&*It++);
+ auto *S1 = cast<sandboxir::StoreInst>(&*It++);
+ auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+ {
+ // Schedule bottom-up
+ sandboxir::Scheduler Sched(getAA(*LLVMF), Ctx);
+ EXPECT_TRUE(Sched.trySchedule({Ret}));
+ EXPECT_TRUE(Sched.trySchedule({S0, S1}));
+ // Scheduling across blocks should fail.
+ EXPECT_FALSE(Sched.trySchedule({Add0, Add1}));
+ }
+ {
+ // Schedule top-down
+ sandboxir::Scheduler Sched(getAA(*LLVMF), Ctx);
+ EXPECT_TRUE(Sched.trySchedule({Add0, Add1}));
+ // Scheduling across blocks should fail.
+ EXPECT_FALSE(Sched.trySchedule({S0, S1}));
+ }
+}

From 485b1ac8a265dcf19c55a98aeefff95158cc63a2 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas
Date: Fri, 24 Jan 2025 14:47:09 -0800
Subject: [PATCH 094/432] [SandboxIR][Docs] C++ highlighting for code block

---
 llvm/docs/SandboxIR.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/SandboxIR.md b/llvm/docs/SandboxIR.md
index 61bae4e36ef43..735190e19966e 100644
--- a/llvm/docs/SandboxIR.md
+++ b/llvm/docs/SandboxIR.md
@@ -6,7 +6,7 @@ Sandbox IR is an IR layer on top of LLVM IR that allows you to save/restore its
 Within your LLVM pass:

-```
+``` C++
 // 1. Include the necessary Sandbox IR header files.
 #include "llvm/SandboxIR/Context.h
 #include "llvm/SandboxIR/Function.h

From 14b44179cb61dd551c911dea54de57b588621005 Mon Sep 17 00:00:00 2001
From: Mark de Wever
Date: Sat, 25 Jan 2025 17:43:16 +0100
Subject: [PATCH 095/432] [libc++][format][3/3] Improves formatting
 performance. (#108990)

This changes the __output_buffer to a new structure. This improves the
performance of std::format, std::format_to, std::format_to_n, and
std::formatted_size.
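To make the redesigned buffer below easier to follow, here is a minimal,
self-contained sketch of the size-limiting idea the patch introduces (the
real helper is __max_output_size in __format/buffer.h). All names in the
sketch are illustrative stand-ins rather than libc++ internals, and it is
only a sketch of the technique, not the library's implementation:

``` C++
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <string_view>

// Illustrative stand-in for a write-limiting helper: it clamps how many
// code units are actually stored while still counting every request.
class max_output_size {
public:
  explicit max_output_size(std::size_t max) : max_size_(max) {}

  // Clamp a bulk write request; the full request is counted regardless.
  std::size_t write_request(std::size_t n) {
    std::size_t allowed =
        written_ < max_size_ ? std::min(n, max_size_ - written_) : 0;
    written_ += n; // counted even when the write is (partially) dropped
    return allowed;
  }

  std::size_t code_units_written() const { return written_; }

private:
  std::size_t max_size_;
  std::size_t written_ = 0;
};

int main() {
  char buf[8] = {};
  std::size_t pos = 0;
  max_output_size limit(sizeof(buf));

  for (std::string_view chunk : {"Hello, ", "world!", " (truncated)"}) {
    std::size_t n = limit.write_request(chunk.size());
    std::copy_n(chunk.data(), n, buf + pos);
    pos += n;
  }
  // Prints "stored 8, requested 25", mirroring format_to_n_result.
  std::printf("stored %zu, requested %zu\n", pos, limit.code_units_written());
}
```

The design point the sketch mirrors is that every request is counted even
when the write is clamped; that is what lets format_to_n report the size
the untruncated output would have had, and what lets formatted_size run
the formatting machinery with a limit of zero without storing anything.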
--- libcxx/include/__format/buffer.h | 623 ++++++++++-------- libcxx/include/__format/format_functions.h | 29 +- libcxx/include/module.modulemap | 5 +- .../format/format.functions/format_tests.h | 2 +- 4 files changed, 358 insertions(+), 301 deletions(-) diff --git a/libcxx/include/__format/buffer.h b/libcxx/include/__format/buffer.h index 9509f19e16724..0c054bbc3a1d8 100644 --- a/libcxx/include/__format/buffer.h +++ b/libcxx/include/__format/buffer.h @@ -14,6 +14,7 @@ #include <__algorithm/fill_n.h> #include <__algorithm/max.h> #include <__algorithm/min.h> +#include <__algorithm/ranges_copy.h> #include <__algorithm/ranges_copy_n.h> #include <__algorithm/transform.h> #include <__algorithm/unwrap_iter.h> @@ -29,6 +30,7 @@ #include <__iterator/wrap_iter.h> #include <__memory/addressof.h> #include <__memory/allocate_at_least.h> +#include <__memory/allocator.h> #include <__memory/allocator_traits.h> #include <__memory/construct_at.h> #include <__memory/ranges_construct_at.h> @@ -37,6 +39,7 @@ #include <__type_traits/conditional.h> #include <__utility/exception_guard.h> #include <__utility/move.h> +#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -52,24 +55,147 @@ _LIBCPP_BEGIN_NAMESPACE_STD namespace __format { +// A helper to limit the total size of code units written. +class _LIBCPP_HIDE_FROM_ABI __max_output_size { +public: + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __max_output_size(size_t __max_size) : __max_size_{__max_size} {} + + // This function adjusts the size of a (bulk) write operations. It ensures the + // number of code units written by a __output_buffer never exceeds + // __max_size_ code units. + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __write_request(size_t __code_units) { + size_t __result = + __code_units_written_ < __max_size_ ? std::min(__code_units, __max_size_ - __code_units_written_) : 0; + __code_units_written_ += __code_units; + return __result; + } + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __code_units_written() const noexcept { return __code_units_written_; } + +private: + size_t __max_size_; + // The code units that would have been written if there was no limit. + // format_to_n returns this value. + size_t __code_units_written_{0}; +}; + /// A "buffer" that handles writing to the proper iterator. /// /// This helper is used together with the @ref back_insert_iterator to offer /// type-erasure for the formatting functions. This reduces the number to /// template instantiations. +/// +/// The design is the following: +/// - There is an external object that connects the buffer to the output. +/// - This buffer object: +/// - inherits publicly from this class. +/// - has a static or dynamic buffer. +/// - has a static member function to make space in its buffer write +/// operations. This can be done by increasing the size of the internal +/// buffer or by writing the contents of the buffer to the output iterator. +/// +/// This member function is a constructor argument, so its name is not +/// fixed. The code uses the name __prepare_write. +/// - The number of output code units can be limited by a __max_output_size +/// object. This is used in format_to_n This object: +/// - Contains the maximum number of code units to be written. +/// - Contains the number of code units that are requested to be written. +/// This number is returned to the user of format_to_n. +/// - The write functions call the object's __request_write member function. +/// This function: +/// - Updates the number of code units that are requested to be written. 
+/// - Returns the number of code units that can be written without +/// exceeding the maximum number of code units to be written. +/// +/// Documentation for the buffer usage members: +/// - __ptr_ +/// The start of the buffer. +/// - __capacity_ +/// The number of code units that can be written. This means +/// [__ptr_, __ptr_ + __capacity_) is a valid range to write to. +/// - __size_ +/// The number of code units written in the buffer. The next code unit will +/// be written at __ptr_ + __size_. This __size_ may NOT contain the total +/// number of code units written by the __output_buffer. Whether or not it +/// does depends on the sub-class used. Typically the total number of code +/// units written is not interesting. It is interesting for format_to_n which +/// has its own way to track this number. +/// +/// Documentation for the modifying buffer operations: +/// The subclasses have a function with the following signature: +/// +/// static void __prepare_write( +/// __output_buffer<_CharT>& __buffer, size_t __code_units); +/// +/// This function is called when a write function writes more code units than +/// the buffer's available space. When an __max_output_size object is provided +/// the number of code units is the number of code units returned from +/// __max_output_size::__request_write function. +/// +/// - The __buffer contains *this. Since the class containing this function +/// inherits from __output_buffer it's safe to cast it to the subclass being +/// used. +/// - The __code_units is the number of code units the caller will write + 1. +/// - This value does not take the available space of the buffer into account. +/// - The push_back function is more efficient when writing before resizing, +/// this means the buffer should always have room for one code unit. Hence +/// the + 1 is the size. +/// - When the function returns there is room for at least one additional code +/// unit. There is no requirement there is room for __code_units code units: +/// - The class has some "bulk" operations. For example, __copy which copies +/// the contents of a basic_string_view to the output. If the sub-class has +/// a fixed size buffer the size of the basic_string_view may be larger +/// than the buffer. In that case it's impossible to honor the requested +/// size. +/// - When the buffer has room for at least one code unit the function may be +/// a no-op. +/// - When the function makes space for more code units it uses one for these +/// functions to signal the change: +/// - __buffer_flushed() +/// - This function is typically used for a fixed sized buffer. +/// - The current contents of [__ptr_, __ptr_ + __size_) have been +/// processed. +/// - __ptr_ remains unchanged. +/// - __capacity_ remains unchanged. +/// - __size_ will be set to 0. +/// - __buffer_moved(_CharT* __ptr, size_t __capacity) +/// - This function is typically used for a dynamic sized buffer. There the +/// location of the buffer changes due to reallocations. +/// - __ptr_ will be set to __ptr. (This value may be the old value of +/// __ptr_). +/// - __capacity_ will be set to __capacity. (This value may be the old +/// value of __capacity_). +/// - __size_ remains unchanged, +/// - The range [__ptr, __ptr + __size_) contains the original data of the +/// range [__ptr_, __ptr_ + __size_). +/// +/// The push_back function expects a valid buffer and a capacity of at least 1. 
+/// This means: +/// - The class is constructed with a valid buffer, +/// - __buffer_moved is called with a valid buffer is used before the first +/// write operation, +/// - no write function is ever called, or +/// - the class is constructed with a __max_output_size object with __max_size 0. +/// +/// The latter option allows formatted_size to use the output buffer without +/// ever writing anything to the buffer. template <__fmt_char_type _CharT> class _LIBCPP_TEMPLATE_VIS __output_buffer { public: - using value_type = _CharT; + using value_type _LIBCPP_NODEBUG = _CharT; + using __prepare_write_type _LIBCPP_NODEBUG = void (*)(__output_buffer<_CharT>&, size_t); - template - _LIBCPP_HIDE_FROM_ABI explicit __output_buffer(_CharT* __ptr, size_t __capacity, _Tp* __obj) - : __ptr_(__ptr), - __capacity_(__capacity), - __flush_([](_CharT* __p, size_t __n, void* __o) { static_cast<_Tp*>(__o)->__flush(__p, __n); }), - __obj_(__obj) {} + [[nodiscard]] + _LIBCPP_HIDE_FROM_ABI explicit __output_buffer(_CharT* __ptr, size_t __capacity, __prepare_write_type __function) + : __output_buffer{__ptr, __capacity, __function, nullptr} {} - _LIBCPP_HIDE_FROM_ABI void __reset(_CharT* __ptr, size_t __capacity) { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __output_buffer( + _CharT* __ptr, size_t __capacity, __prepare_write_type __function, __max_output_size* __max_output_size) + : __ptr_(__ptr), __capacity_(__capacity), __prepare_write_(__function), __max_output_size_(__max_output_size) {} + + _LIBCPP_HIDE_FROM_ABI void __buffer_flushed() { __size_ = 0; } + + _LIBCPP_HIDE_FROM_ABI void __buffer_moved(_CharT* __ptr, size_t __capacity) { __ptr_ = __ptr; __capacity_ = __capacity; } @@ -78,12 +204,18 @@ class _LIBCPP_TEMPLATE_VIS __output_buffer { // Used in std::back_insert_iterator. _LIBCPP_HIDE_FROM_ABI void push_back(_CharT __c) { + if (__max_output_size_ && __max_output_size_->__write_request(1) == 0) + return; + + _LIBCPP_ASSERT_INTERNAL( + __ptr_ && __size_ < __capacity_ && __available() >= 1, "attempted to write outside the buffer"); + __ptr_[__size_++] = __c; // Profiling showed flushing after adding is more efficient than flushing // when entering the function. if (__size_ == __capacity_) - __flush(); + __prepare_write(0); } /// Copies the input __str to the buffer. @@ -104,25 +236,20 @@ class _LIBCPP_TEMPLATE_VIS __output_buffer { // upper case. For integral these strings are short. // TODO FMT Look at the improvements above. size_t __n = __str.size(); - - __flush_on_overflow(__n); - if (__n < __capacity_) { // push_back requires the buffer to have room for at least one character (so use <). - std::copy_n(__str.data(), __n, std::addressof(__ptr_[__size_])); - __size_ += __n; - return; + if (__max_output_size_) { + __n = __max_output_size_->__write_request(__n); + if (__n == 0) + return; } - // The output doesn't fit in the internal buffer. - // Copy the data in "__capacity_" sized chunks. 
- _LIBCPP_ASSERT_INTERNAL(__size_ == 0, "the buffer should be flushed by __flush_on_overflow"); const _InCharT* __first = __str.data(); do { - size_t __chunk = std::min(__n, __capacity_); + __prepare_write(__n); + size_t __chunk = std::min(__n, __available()); std::copy_n(__first, __chunk, std::addressof(__ptr_[__size_])); - __size_ = __chunk; + __size_ += __chunk; __first += __chunk; __n -= __chunk; - __flush(); } while (__n); } @@ -136,121 +263,59 @@ class _LIBCPP_TEMPLATE_VIS __output_buffer { _LIBCPP_ASSERT_INTERNAL(__first <= __last, "not a valid range"); size_t __n = static_cast(__last - __first); - __flush_on_overflow(__n); - if (__n < __capacity_) { // push_back requires the buffer to have room for at least one character (so use <). - std::transform(__first, __last, std::addressof(__ptr_[__size_]), std::move(__operation)); - __size_ += __n; - return; + if (__max_output_size_) { + __n = __max_output_size_->__write_request(__n); + if (__n == 0) + return; } - // The output doesn't fit in the internal buffer. - // Transform the data in "__capacity_" sized chunks. - _LIBCPP_ASSERT_INTERNAL(__size_ == 0, "the buffer should be flushed by __flush_on_overflow"); do { - size_t __chunk = std::min(__n, __capacity_); + __prepare_write(__n); + size_t __chunk = std::min(__n, __available()); std::transform(__first, __first + __chunk, std::addressof(__ptr_[__size_]), __operation); - __size_ = __chunk; + __size_ += __chunk; __first += __chunk; __n -= __chunk; - __flush(); } while (__n); } /// A \c fill_n wrapper. _LIBCPP_HIDE_FROM_ABI void __fill(size_t __n, _CharT __value) { - __flush_on_overflow(__n); - if (__n < __capacity_) { // push_back requires the buffer to have room for at least one character (so use <). - std::fill_n(std::addressof(__ptr_[__size_]), __n, __value); - __size_ += __n; - return; + if (__max_output_size_) { + __n = __max_output_size_->__write_request(__n); + if (__n == 0) + return; } - // The output doesn't fit in the internal buffer. - // Fill the buffer in "__capacity_" sized chunks. - _LIBCPP_ASSERT_INTERNAL(__size_ == 0, "the buffer should be flushed by __flush_on_overflow"); do { - size_t __chunk = std::min(__n, __capacity_); + __prepare_write(__n); + size_t __chunk = std::min(__n, __available()); std::fill_n(std::addressof(__ptr_[__size_]), __chunk, __value); - __size_ = __chunk; + __size_ += __chunk; __n -= __chunk; - __flush(); } while (__n); } - _LIBCPP_HIDE_FROM_ABI void __flush() { - __flush_(__ptr_, __size_, __obj_); - __size_ = 0; - } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __capacity() const { return __capacity_; } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __size() const { return __size_; } private: _CharT* __ptr_; size_t __capacity_; size_t __size_{0}; - void (*__flush_)(_CharT*, size_t, void*); - void* __obj_; - - /// Flushes the buffer when the output operation would overflow the buffer. - /// - /// A simple approach for the overflow detection would be something along the - /// lines: - /// \code - /// // The internal buffer is large enough. - /// if (__n <= __capacity_) { - /// // Flush when we really would overflow. - /// if (__size_ + __n >= __capacity_) - /// __flush(); - /// ... - /// } - /// \endcode - /// - /// This approach works for all cases but one: - /// A __format_to_n_buffer_base where \ref __enable_direct_output is true. - /// In that case the \ref __capacity_ of the buffer changes during the first - /// \ref __flush. During that operation the output buffer switches from its - /// __writer_ to its __storage_. 
The \ref __capacity_ of the former depends - /// on the value of n, of the latter is a fixed size. For example: - /// - a format_to_n call with a 10'000 char buffer, - /// - the buffer is filled with 9'500 chars, - /// - adding 1'000 elements would overflow the buffer so the buffer gets - /// changed and the \ref __capacity_ decreases from 10'000 to - /// __buffer_size (256 at the time of writing). - /// - /// This means that the \ref __flush for this class may need to copy a part of - /// the internal buffer to the proper output. In this example there will be - /// 500 characters that need this copy operation. - /// - /// Note it would be more efficient to write 500 chars directly and then swap - /// the buffers. This would make the code more complex and \ref format_to_n is - /// not the most common use case. Therefore the optimization isn't done. - _LIBCPP_HIDE_FROM_ABI void __flush_on_overflow(size_t __n) { - if (__size_ + __n >= __capacity_) - __flush(); - } -}; - -/// A storage using an internal buffer. -/// -/// This storage is used when writing a single element to the output iterator -/// is expensive. -template <__fmt_char_type _CharT> -class _LIBCPP_TEMPLATE_VIS __internal_storage { -public: - _LIBCPP_HIDE_FROM_ABI _CharT* __begin() { return __buffer_; } + void (*__prepare_write_)(__output_buffer<_CharT>&, size_t); + __max_output_size* __max_output_size_; - static constexpr size_t __buffer_size = 256 / sizeof(_CharT); + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __available() const { return __capacity_ - __size_; } -private: - _CharT __buffer_[__buffer_size]; + _LIBCPP_HIDE_FROM_ABI void __prepare_write(size_t __code_units) { + // Always have space for one additional code unit. This is a precondition of the push_back function. + __code_units += 1; + if (__available() < __code_units) + __prepare_write_(*this, __code_units + 1); + } }; -/// A storage writing directly to the storage. -/// -/// This requires the storage to be a contiguous buffer of \a _CharT. -/// Since the output is directly written to the underlying storage this class -/// is just an empty class. -template <__fmt_char_type _CharT> -class _LIBCPP_TEMPLATE_VIS __direct_storage {}; - template concept __enable_direct_output = __fmt_char_type<_CharT> && @@ -259,40 +324,6 @@ concept __enable_direct_output = // `#ifdef`. || same_as<_OutIt, __wrap_iter<_CharT*>>); -/// Write policy for directly writing to the underlying output. -template -class _LIBCPP_TEMPLATE_VIS __writer_direct { -public: - _LIBCPP_HIDE_FROM_ABI explicit __writer_direct(_OutIt __out_it) : __out_it_(__out_it) {} - - _LIBCPP_HIDE_FROM_ABI _OutIt __out_it() { return __out_it_; } - - _LIBCPP_HIDE_FROM_ABI void __flush(_CharT*, size_t __n) { - // _OutIt can be a __wrap_iter. Therefore the original iterator - // is adjusted. - __out_it_ += __n; - } - -private: - _OutIt __out_it_; -}; - -/// Write policy for copying the buffer to the output. -template -class _LIBCPP_TEMPLATE_VIS __writer_iterator { -public: - _LIBCPP_HIDE_FROM_ABI explicit __writer_iterator(_OutIt __out_it) : __out_it_{std::move(__out_it)} {} - - _LIBCPP_HIDE_FROM_ABI _OutIt __out_it() && { return std::move(__out_it_); } - - _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) { - __out_it_ = std::ranges::copy_n(__ptr, __n, std::move(__out_it_)).out; - } - -private: - _OutIt __out_it_; -}; - /// Concept to see whether a \a _Container is insertable. 
/// /// The concept is used to validate whether multiple calls to a @@ -310,196 +341,220 @@ concept __insertable = /// Extract the container type of a \ref back_insert_iterator. template struct _LIBCPP_TEMPLATE_VIS __back_insert_iterator_container { - using type = void; + using type _LIBCPP_NODEBUG = void; }; template <__insertable _Container> struct _LIBCPP_TEMPLATE_VIS __back_insert_iterator_container> { - using type = _Container; + using type _LIBCPP_NODEBUG = _Container; }; -/// Write policy for inserting the buffer in a container. -template -class _LIBCPP_TEMPLATE_VIS __writer_container { +// A dynamically growing buffer. +template <__fmt_char_type _CharT> +class _LIBCPP_TEMPLATE_VIS __allocating_buffer : public __output_buffer<_CharT> { public: - using _CharT _LIBCPP_NODEBUG = typename _Container::value_type; + __allocating_buffer(const __allocating_buffer&) = delete; + __allocating_buffer& operator=(const __allocating_buffer&) = delete; - _LIBCPP_HIDE_FROM_ABI explicit __writer_container(back_insert_iterator<_Container> __out_it) - : __container_{__out_it.__get_container()} {} + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI __allocating_buffer() : __allocating_buffer{nullptr} {} - _LIBCPP_HIDE_FROM_ABI auto __out_it() { return std::back_inserter(*__container_); } + [[nodiscard]] + _LIBCPP_HIDE_FROM_ABI explicit __allocating_buffer(__max_output_size* __max_output_size) + : __output_buffer<_CharT>{__small_buffer_, __buffer_size_, __prepare_write, __max_output_size} {} - _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) { - __container_->insert(__container_->end(), __ptr, __ptr + __n); + _LIBCPP_HIDE_FROM_ABI ~__allocating_buffer() { + if (__ptr_ != __small_buffer_) + _Alloc{}.deallocate(__ptr_, this->__capacity()); } -private: - _Container* __container_; -}; + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI basic_string_view<_CharT> __view() { return {__ptr_, this->__size()}; } -/// Selects the type of the writer used for the output iterator. -template -class _LIBCPP_TEMPLATE_VIS __writer_selector { - using _Container _LIBCPP_NODEBUG = typename __back_insert_iterator_container<_OutIt>::type; +private: + using _Alloc _LIBCPP_NODEBUG = allocator<_CharT>; -public: - using type = - conditional_t, - __writer_container<_Container>, - conditional_t<__enable_direct_output<_OutIt, _CharT>, - __writer_direct<_OutIt, _CharT>, - __writer_iterator<_OutIt, _CharT>>>; -}; + // Since allocating is expensive the class has a small internal buffer. When + // its capacity is exceeded a dynamic buffer will be allocated. + static constexpr size_t __buffer_size_ = 256; + _CharT __small_buffer_[__buffer_size_]; -/// The generic formatting buffer. 
-template - requires(output_iterator<_OutIt, const _CharT&>) -class _LIBCPP_TEMPLATE_VIS __format_buffer { - using _Storage _LIBCPP_NODEBUG = - conditional_t<__enable_direct_output<_OutIt, _CharT>, __direct_storage<_CharT>, __internal_storage<_CharT>>; + _CharT* __ptr_{__small_buffer_}; -public: - _LIBCPP_HIDE_FROM_ABI explicit __format_buffer(_OutIt __out_it) - requires(same_as<_Storage, __internal_storage<_CharT>>) - : __output_(__storage_.__begin(), __storage_.__buffer_size, this), __writer_(std::move(__out_it)) {} + _LIBCPP_HIDE_FROM_ABI void __grow_buffer(size_t __capacity) { + if (__capacity < __buffer_size_) + return; - _LIBCPP_HIDE_FROM_ABI explicit __format_buffer(_OutIt __out_it) - requires(same_as<_Storage, __direct_storage<_CharT>>) - : __output_(std::__unwrap_iter(__out_it), size_t(-1), this), __writer_(std::move(__out_it)) {} + _LIBCPP_ASSERT_INTERNAL(__capacity > this->__capacity(), "the buffer must grow"); - _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return __output_.__make_output_iterator(); } + // _CharT is an implicit lifetime type so can be used without explicit + // construction or destruction. + _Alloc __alloc; + auto __result = std::__allocate_at_least(__alloc, __capacity); + std::copy_n(__ptr_, this->__size(), __result.ptr); + if (__ptr_ != __small_buffer_) + __alloc.deallocate(__ptr_, this->__capacity()); - _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) { __writer_.__flush(__ptr, __n); } + __ptr_ = __result.ptr; + this->__buffer_moved(__ptr_, __result.count); + } - _LIBCPP_HIDE_FROM_ABI _OutIt __out_it() && { - __output_.__flush(); - return std::move(__writer_).__out_it(); + _LIBCPP_HIDE_FROM_ABI void __prepare_write(size_t __size_hint) { + __grow_buffer(std::max(this->__capacity() + __size_hint, this->__capacity() * 1.6)); } -private: - _LIBCPP_NO_UNIQUE_ADDRESS _Storage __storage_; - __output_buffer<_CharT> __output_; - typename __writer_selector<_OutIt, _CharT>::type __writer_; + _LIBCPP_HIDE_FROM_ABI static void __prepare_write(__output_buffer<_CharT>& __buffer, size_t __size_hint) { + static_cast<__allocating_buffer<_CharT>&>(__buffer).__prepare_write(__size_hint); + } }; -/// A buffer that counts the number of insertions. -/// -/// Since \ref formatted_size only needs to know the size, the output itself is -/// discarded. -template <__fmt_char_type _CharT> -class _LIBCPP_TEMPLATE_VIS __formatted_size_buffer { +// A buffer that directly writes to the underlying buffer. 
+template +class _LIBCPP_TEMPLATE_VIS __direct_iterator_buffer : public __output_buffer<_CharT> { public: - _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return __output_.__make_output_iterator(); } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __direct_iterator_buffer(_OutIt __out_it) + : __direct_iterator_buffer{__out_it, nullptr} {} - _LIBCPP_HIDE_FROM_ABI void __flush(const _CharT*, size_t __n) { __size_ += __n; } + [[nodiscard]] + _LIBCPP_HIDE_FROM_ABI explicit __direct_iterator_buffer(_OutIt __out_it, __max_output_size* __max_output_size) + : __output_buffer<_CharT>{std::__unwrap_iter(__out_it), __buffer_size, __prepare_write, __max_output_size}, + __out_it_(__out_it) {} - _LIBCPP_HIDE_FROM_ABI size_t __result() && { - __output_.__flush(); - return __size_; - } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _OutIt __out_it() && { return __out_it_ + this->__size(); } private: - __internal_storage<_CharT> __storage_; - __output_buffer<_CharT> __output_{__storage_.__begin(), __storage_.__buffer_size, this}; - size_t __size_{0}; -}; + // The function format_to expects a buffer large enough for the output. The + // function format_to_n has its own helper class that restricts the number of + // write options. So this function class can pretend to have an infinite + // buffer. + static constexpr size_t __buffer_size = -1; + + _OutIt __out_it_; -/// The base of a buffer that counts and limits the number of insertions. -template - requires(output_iterator<_OutIt, const _CharT&>) -struct _LIBCPP_TEMPLATE_VIS __format_to_n_buffer_base { - using _Size _LIBCPP_NODEBUG = iter_difference_t<_OutIt>; + _LIBCPP_HIDE_FROM_ABI static void + __prepare_write([[maybe_unused]] __output_buffer<_CharT>& __buffer, [[maybe_unused]] size_t __size_hint) { + std::__throw_length_error("__direct_iterator_buffer"); + } +}; +// A buffer that writes its output to the end of a container. +template +class _LIBCPP_TEMPLATE_VIS __container_inserter_buffer : public __output_buffer<_CharT> { public: - _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer_base(_OutIt __out_it, _Size __max_size) - : __writer_(std::move(__out_it)), __max_size_(std::max(_Size(0), __max_size)) {} + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __container_inserter_buffer(_OutIt __out_it) + : __container_inserter_buffer{__out_it, nullptr} {} - _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) { - if (_Size(__size_) <= __max_size_) - __writer_.__flush(__ptr, std::min(_Size(__n), __max_size_ - __size_)); - __size_ += __n; + [[nodiscard]] + _LIBCPP_HIDE_FROM_ABI explicit __container_inserter_buffer(_OutIt __out_it, __max_output_size* __max_output_size) + : __output_buffer<_CharT>{__small_buffer_, __buffer_size, __prepare_write, __max_output_size}, + __container_{__out_it.__get_container()} {} + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto __out_it() && { + __container_->insert(__container_->end(), __small_buffer_, __small_buffer_ + this->__size()); + return std::back_inserter(*__container_); } -protected: - __internal_storage<_CharT> __storage_; - __output_buffer<_CharT> __output_{__storage_.__begin(), __storage_.__buffer_size, this}; - typename __writer_selector<_OutIt, _CharT>::type __writer_; +private: + typename __back_insert_iterator_container<_OutIt>::type* __container_; + + // This class uses a fixed size buffer and appends the elements in + // __buffer_size chunks. An alternative would be to use an allocating buffer + // and append the output in a single write operation. Benchmarking showed no + // performance difference. 
+ static constexpr size_t __buffer_size = 256; + _CharT __small_buffer_[__buffer_size]; + + _LIBCPP_HIDE_FROM_ABI void __prepare_write() { + __container_->insert(__container_->end(), __small_buffer_, __small_buffer_ + this->__size()); + this->__buffer_flushed(); + } - _Size __max_size_; - _Size __size_{0}; + _LIBCPP_HIDE_FROM_ABI static void + __prepare_write(__output_buffer<_CharT>& __buffer, [[maybe_unused]] size_t __size_hint) { + static_cast<__container_inserter_buffer<_OutIt, _CharT>&>(__buffer).__prepare_write(); + } }; -/// The base of a buffer that counts and limits the number of insertions. -/// -/// This version is used when \c __enable_direct_output<_OutIt, _CharT> == true. -/// -/// This class limits the size available to the direct writer so it will not -/// exceed the maximum number of code units. +// A buffer that writes to an iterator. +// +// Unlike the __container_inserter_buffer this class' performance does benefit +// from allocating and then inserting. template - requires(output_iterator<_OutIt, const _CharT&>) -class _LIBCPP_TEMPLATE_VIS __format_to_n_buffer_base<_OutIt, _CharT, true> { - using _Size _LIBCPP_NODEBUG = iter_difference_t<_OutIt>; - +class _LIBCPP_TEMPLATE_VIS __iterator_buffer : public __allocating_buffer<_CharT> { public: - _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer_base(_OutIt __out_it, _Size __max_size) - : __output_(std::__unwrap_iter(__out_it), __max_size, this), - __writer_(std::move(__out_it)), - __max_size_(__max_size) { - if (__max_size <= 0) [[unlikely]] - __output_.__reset(__storage_.__begin(), __storage_.__buffer_size); - } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __iterator_buffer(_OutIt __out_it) + : __allocating_buffer<_CharT>{}, __out_it_{std::move(__out_it)} {} - _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) { - // A __flush to the direct writer happens in the following occasions: - // - The format function has written the maximum number of allowed code - // units. At this point it's no longer valid to write to this writer. So - // switch to the internal storage. This internal storage doesn't need to - // be written anywhere so the __flush for that storage writes no output. - // - Like above, but the next "mass write" operation would overflow the - // buffer. In that case the buffer is pre-emptively switched. The still - // valid code units will be written separately. - // - The format_to_n function is finished. In this case there's no need to - // switch the buffer, but for simplicity the buffers are still switched. - // When the __max_size <= 0 the constructor already switched the buffers. - if (__size_ == 0 && __ptr != __storage_.__begin()) { - __writer_.__flush(__ptr, __n); - __output_.__reset(__storage_.__begin(), __storage_.__buffer_size); - } else if (__size_ < __max_size_) { - // Copies a part of the internal buffer to the output up to n characters. - // See __output_buffer<_CharT>::__flush_on_overflow for more information. 
- _Size __s = std::min(_Size(__n), __max_size_ - __size_); - std::copy_n(__ptr, __s, __writer_.__out_it()); - __writer_.__flush(__ptr, __s); - } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __iterator_buffer(_OutIt __out_it, __max_output_size* __max_output_size) + : __allocating_buffer<_CharT>{__max_output_size}, __out_it_{std::move(__out_it)} {} - __size_ += __n; + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto __out_it() && { + return std::ranges::copy(this->__view(), std::move(__out_it_)).out; } -protected: - __internal_storage<_CharT> __storage_; - __output_buffer<_CharT> __output_; - __writer_direct<_OutIt, _CharT> __writer_; +private: + _OutIt __out_it_; +}; + +// Selects the type of the buffer used for the output iterator. +template +class _LIBCPP_TEMPLATE_VIS __buffer_selector { + using _Container _LIBCPP_NODEBUG = __back_insert_iterator_container<_OutIt>::type; - _Size __max_size_; - _Size __size_{0}; +public: + using type _LIBCPP_NODEBUG = + conditional_t, + __container_inserter_buffer<_OutIt, _CharT>, + conditional_t<__enable_direct_output<_OutIt, _CharT>, + __direct_iterator_buffer<_OutIt, _CharT>, + __iterator_buffer<_OutIt, _CharT>>>; }; -/// The buffer that counts and limits the number of insertions. +// A buffer that counts and limits the number of insertions. template - requires(output_iterator<_OutIt, const _CharT&>) -struct _LIBCPP_TEMPLATE_VIS __format_to_n_buffer final - : public __format_to_n_buffer_base< _OutIt, _CharT, __enable_direct_output<_OutIt, _CharT>> { - using _Base _LIBCPP_NODEBUG = __format_to_n_buffer_base<_OutIt, _CharT, __enable_direct_output<_OutIt, _CharT>>; - using _Size _LIBCPP_NODEBUG = iter_difference_t<_OutIt>; +class _LIBCPP_TEMPLATE_VIS __format_to_n_buffer : private __buffer_selector<_OutIt, _CharT>::type { +public: + using _Base _LIBCPP_NODEBUG = __buffer_selector<_OutIt, _CharT>::type; + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI __format_to_n_buffer(_OutIt __out_it, iter_difference_t<_OutIt> __n) + : _Base{std::move(__out_it), std::addressof(__max_output_size_)}, + __max_output_size_{__n < 0 ? size_t{0} : static_cast(__n)} {} + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return _Base::__make_output_iterator(); } + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI format_to_n_result<_OutIt> __result() && { + return {static_cast<_Base&&>(*this).__out_it(), + static_cast>(__max_output_size_.__code_units_written())}; + } + +private: + __max_output_size __max_output_size_; +}; +// A buffer that counts the number of insertions. +// +// Since formatted_size only needs to know the size, the output itself is +// discarded. +template <__fmt_char_type _CharT> +class _LIBCPP_TEMPLATE_VIS __formatted_size_buffer : private __output_buffer<_CharT> { public: - _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer(_OutIt __out_it, _Size __max_size) - : _Base(std::move(__out_it), __max_size) {} - _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return this->__output_.__make_output_iterator(); } + using _Base _LIBCPP_NODEBUG = __output_buffer<_CharT>; + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI __formatted_size_buffer() + : _Base{nullptr, 0, __prepare_write, std::addressof(__max_output_size_)} {} + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return _Base::__make_output_iterator(); } + + // This function does not need to be r-value qualified, however this is + // consistent with similar objects. 
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __result() && { return __max_output_size_.__code_units_written(); } + +private: + __max_output_size __max_output_size_{0}; - _LIBCPP_HIDE_FROM_ABI format_to_n_result<_OutIt> __result() && { - this->__output_.__flush(); - return {std::move(this->__writer_).__out_it(), this->__size_}; + _LIBCPP_HIDE_FROM_ABI static void + __prepare_write([[maybe_unused]] __output_buffer<_CharT>& __buffer, [[maybe_unused]] size_t __size_hint) { + // Note this function does not satisfy the requirement of giving a 1 code unit buffer. + _LIBCPP_ASSERT_INTERNAL( + false, "Since __max_output_size_.__max_size_ == 0 there should never be call to this function."); } }; @@ -526,11 +581,11 @@ class _LIBCPP_TEMPLATE_VIS __retarget_buffer { using _Alloc _LIBCPP_NODEBUG = allocator<_CharT>; public: - using value_type = _CharT; + using value_type _LIBCPP_NODEBUG = _CharT; struct __iterator { - using difference_type = ptrdiff_t; - using value_type = _CharT; + using difference_type _LIBCPP_NODEBUG = ptrdiff_t; + using value_type _LIBCPP_NODEBUG = _CharT; _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator(__retarget_buffer& __buffer) : __buffer_(std::addressof(__buffer)) {} diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h index b920be5acbe86..5feaf7e5a064a 100644 --- a/libcxx/include/__format/format_functions.h +++ b/libcxx/include/__format/format_functions.h @@ -31,7 +31,6 @@ #include <__format/formatter_pointer.h> #include <__format/formatter_string.h> #include <__format/parser_std_format_spec.h> -#include <__iterator/back_insert_iterator.h> #include <__iterator/concepts.h> #include <__iterator/incrementable_traits.h> #include <__iterator/iterator_traits.h> // iter_value_t @@ -411,7 +410,7 @@ _LIBCPP_HIDE_FROM_ABI _OutIt __vformat_to(_OutIt __out_it, return std::__format::__vformat_to( basic_format_parse_context{__fmt, __args.__size()}, std::__format_context_create(std::move(__out_it), __args)); else { - __format::__format_buffer<_OutIt, _CharT> __buffer{std::move(__out_it)}; + typename __format::__buffer_selector<_OutIt, _CharT>::type __buffer{std::move(__out_it)}; std::__format::__vformat_to(basic_format_parse_context{__fmt, __args.__size()}, std::__format_context_create(__buffer.__make_output_iterator(), __args)); return std::move(__buffer).__out_it(); @@ -452,9 +451,9 @@ format_to(_OutIt __out_it, wformat_string<_Args...> __fmt, _Args&&... __args) { // fires too eagerly, see http://llvm.org/PR61563. 
template [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI string vformat(string_view __fmt, format_args __args) { - string __res; - std::vformat_to(std::back_inserter(__res), __fmt, __args); - return __res; + __format::__allocating_buffer __buffer; + std::vformat_to(__buffer.__make_output_iterator(), __fmt, __args); + return string{__buffer.__view()}; } # if _LIBCPP_HAS_WIDE_CHARACTERS @@ -463,9 +462,9 @@ template template [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI wstring vformat(wstring_view __fmt, wformat_args __args) { - wstring __res; - std::vformat_to(std::back_inserter(__res), __fmt, __args); - return __res; + __format::__allocating_buffer __buffer; + std::vformat_to(__buffer.__make_output_iterator(), __fmt, __args); + return wstring{__buffer.__view()}; } # endif @@ -544,7 +543,7 @@ _LIBCPP_HIDE_FROM_ABI _OutIt __vformat_to( return std::__format::__vformat_to(basic_format_parse_context{__fmt, __args.__size()}, std::__format_context_create(std::move(__out_it), __args, std::move(__loc))); else { - __format::__format_buffer<_OutIt, _CharT> __buffer{std::move(__out_it)}; + typename __format::__buffer_selector<_OutIt, _CharT>::type __buffer{std::move(__out_it)}; std::__format::__vformat_to( basic_format_parse_context{__fmt, __args.__size()}, std::__format_context_create(__buffer.__make_output_iterator(), __args, std::move(__loc))); @@ -585,9 +584,9 @@ format_to(_OutIt __out_it, locale __loc, wformat_string<_Args...> __fmt, _Args&& template [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI string vformat(locale __loc, string_view __fmt, format_args __args) { - string __res; - std::vformat_to(std::back_inserter(__res), std::move(__loc), __fmt, __args); - return __res; + __format::__allocating_buffer __buffer; + std::vformat_to(__buffer.__make_output_iterator(), std::move(__loc), __fmt, __args); + return string{__buffer.__view()}; } # if _LIBCPP_HAS_WIDE_CHARACTERS @@ -596,9 +595,9 @@ vformat(locale __loc, string_view __fmt, format_args __args) { template [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI wstring vformat(locale __loc, wstring_view __fmt, wformat_args __args) { - wstring __res; - std::vformat_to(std::back_inserter(__res), std::move(__loc), __fmt, __args); - return __res; + __format::__allocating_buffer __buffer; + std::vformat_to(__buffer.__make_output_iterator(), std::move(__loc), __fmt, __args); + return wstring{__buffer.__view()}; } # endif diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 85b88ca137f85..6c2fb8dc3940b 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1253,7 +1253,10 @@ module std [system] { } module format { - module buffer { header "__format/buffer.h" } + module buffer { + header "__format/buffer.h" + export std.iterator.back_insert_iterator + } module concepts { header "__format/concepts.h" } module container_adaptor { header "__format/container_adaptor.h" } module enable_insertable { header "__format/enable_insertable.h" } diff --git a/libcxx/test/std/utilities/format/format.functions/format_tests.h b/libcxx/test/std/utilities/format/format.functions/format_tests.h index b2ed6775fe8a1..3969b341cb146 100644 --- a/libcxx/test/std/utilities/format/format.functions/format_tests.h +++ b/libcxx/test/std/utilities/format/format.functions/format_tests.h @@ -3038,7 +3038,7 @@ void format_test_buffer_optimizations(TestFunction check) { // Used to validate our test sets are the proper size. 
// To test the chunked operations it needs to be larger than the internal // buffer. Picked a nice looking number. - constexpr int minimum = 3 * std::__format::__internal_storage::__buffer_size; + constexpr int minimum = 3 * 256; #else constexpr int minimum = 1; #endif From 8b6211472793680994f7bc15abb5910d0a916cc5 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Sat, 23 Nov 2024 16:29:19 -0800 Subject: [PATCH 096/432] [lldb] Delete unused lldbutil.print_registers (NFC) --- .../Python/lldbsuite/test/lldbutil.py | 27 ------------------- .../API/macosx/universal/TestUniversal.py | 2 -- 2 files changed, 29 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/lldbutil.py b/lldb/packages/Python/lldbsuite/test/lldbutil.py index 07b5f8cc7d900..ef068cf7f9ed1 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbutil.py +++ b/lldb/packages/Python/lldbsuite/test/lldbutil.py @@ -1353,33 +1353,6 @@ def get_args_as_string(frame, showFuncName=True): return "(%s)" % (", ".join(args)) -def print_registers(frame, string_buffer=False): - """Prints all the register sets of the frame.""" - - output = io.StringIO() if string_buffer else sys.stdout - - print("Register sets for " + str(frame), file=output) - - registerSet = frame.GetRegisters() # Return type of SBValueList. - print( - "Frame registers (size of register set = %d):" % registerSet.GetSize(), - file=output, - ) - for value in registerSet: - # print(value, file=output) - print( - "%s (number of children = %d):" % (value.GetName(), value.GetNumChildren()), - file=output, - ) - for child in value: - print( - "Name: %s, Value: %s" % (child.GetName(), child.GetValue()), file=output - ) - - if string_buffer: - return output.getvalue() - - def get_registers(frame, kind): """Returns the registers given the frame and the kind of registers desired. diff --git a/lldb/test/API/macosx/universal/TestUniversal.py b/lldb/test/API/macosx/universal/TestUniversal.py index aecc8814b377e..3c043df641978 100644 --- a/lldb/test/API/macosx/universal/TestUniversal.py +++ b/lldb/test/API/macosx/universal/TestUniversal.py @@ -57,8 +57,6 @@ def test_sbdebugger_create_target_with_file_and_target_triple(self): @skipIf(compiler="clang", compiler_version=["<", "7.0"]) def test_process_launch_for_universal(self): """Test process launch of a universal binary.""" - from lldbsuite.test.lldbutil import print_registers - if not haswellOrLater(): return From b178c2d63e0701655046dfd2ead195b36e0df397 Mon Sep 17 00:00:00 2001 From: Vasileios Porpodas Date: Sat, 25 Jan 2025 08:28:51 -0800 Subject: [PATCH 097/432] [SandboxVec][DAG] Fix trim schedule Fix trimSchedule by skipping instructions without a DAG Node. 
--- llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp index 06c1ef6b6d5ae..9ec5d830d8b4a 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp @@ -172,6 +172,8 @@ void Scheduler::trimSchedule(ArrayRef Instrs) { for (auto *I = LowestI, *E = TopI->getPrevNode(); I != E; I = I->getPrevNode()) { auto *N = DAG.getNode(I); + if (N == nullptr) + continue; if (auto *SB = N->getSchedBundle()) eraseBundle(SB); } From aba0476f23fc2a851792e9d85c25ee34a5ea7ed0 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Sat, 23 Nov 2024 16:30:57 -0800 Subject: [PATCH 098/432] [lldb] Delete lldbutil.PrintableRegex (NFC) Use of this class wasn't making use of the original regex string. Note that `re.Pattern` has a `pattern` property to access the original regex. --- lldb/packages/Python/lldbsuite/test/lldbutil.py | 15 --------------- .../libcxx/atomic/TestLibCxxAtomic.py | 5 ++--- .../libcxx/initializerlist/TestInitializerList.py | 5 ++--- 3 files changed, 4 insertions(+), 21 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/lldbutil.py b/lldb/packages/Python/lldbsuite/test/lldbutil.py index ef068cf7f9ed1..27e0040034370 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbutil.py +++ b/lldb/packages/Python/lldbsuite/test/lldbutil.py @@ -1578,21 +1578,6 @@ def set_actions_for_signal( ) -class PrintableRegex(object): - def __init__(self, text): - self.regex = re.compile(text) - self.text = text - - def match(self, str): - return self.regex.match(str) - - def __str__(self): - return "%s" % (self.text) - - def __repr__(self): - return "re.compile(%s) -> %s" % (self.text, self.regex) - - def skip_if_callable(test, mycallable, reason): if callable(mycallable): if mycallable(test): diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py index 241226d50df80..c6592ede03147 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py @@ -2,6 +2,7 @@ Test lldb data formatter subsystem. """ +import re import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -30,9 +31,7 @@ def test(self): self.runCmd("run", RUN_SUCCEEDED) - lldbutil.skip_if_library_missing( - self, self.target(), lldbutil.PrintableRegex("libc\+\+") - ) + lldbutil.skip_if_library_missing(self, self.target(), re.compile(r"libc\+\+")) # The stop reason of the thread should be breakpoint. 
self.expect( diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/initializerlist/TestInitializerList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/initializerlist/TestInitializerList.py index 93d5392830b50..b8a1dd3569d77 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/initializerlist/TestInitializerList.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/initializerlist/TestInitializerList.py @@ -3,6 +3,7 @@ """ +import re import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -24,9 +25,7 @@ def test(self): self.runCmd("run", RUN_SUCCEEDED) - lldbutil.skip_if_library_missing( - self, self.target(), lldbutil.PrintableRegex("libc\+\+") - ) + lldbutil.skip_if_library_missing(self, self.target(), re.compile(r"libc\+\+")) # The stop reason of the thread should be breakpoint. self.expect( From def50f701f6a2c1e0550bb341fd8b64bed299e72 Mon Sep 17 00:00:00 2001 From: Hui Date: Sat, 25 Jan 2025 18:30:00 +0000 Subject: [PATCH 099/432] [libc++] implement `std::flat_multimap` (#113835) fixes https://github.com/llvm/llvm-project/issues/105190 --------- Co-authored-by: Hui Xie Co-authored-by: Hui Xie --- libcxx/docs/FeatureTestMacroTable.rst | 4 + libcxx/docs/Status/Cxx23Papers.csv | 2 +- libcxx/include/CMakeLists.txt | 3 + libcxx/include/__flat_map/flat_map.h | 123 +- libcxx/include/__flat_map/flat_multimap.h | 1010 +++++++++++++++++ libcxx/include/__flat_map/sorted_equivalent.h | 31 + libcxx/include/__flat_map/utils.h | 103 ++ libcxx/include/__functional/is_transparent.h | 6 +- libcxx/include/flat_map | 21 + libcxx/include/module.modulemap | 13 +- libcxx/include/version | 4 + libcxx/modules/std/flat_map.inc | 4 +- .../flat.map/assert.input_range.pass.cpp | 0 .../flat.map/assert.sorted_unique.pass.cpp | 0 .../flat.multimap/assert.input_range.pass.cpp | 66 ++ .../assert.sorted_equivalent.pass.cpp | 225 ++++ .../flat.map.syn/sorted_equivalent.pass.cpp | 50 + .../flat.map/flat.map.capacity/empty.pass.cpp | 4 +- .../flat.map.capacity/empty.verify.cpp | 6 +- .../flat.map.cons/deduct.compile.pass.cpp | 52 + .../flat.map/flat.map.cons/deduct.pass.cpp | 63 +- .../flat.map/flat.map.cons/deduct.verify.cpp | 44 - .../flat.map.cons/default_noexcept.pass.cpp | 2 + .../flat.map.cons/dtor_noexcept.pass.cpp | 6 +- .../flat.multimap.capacity/empty.pass.cpp | 51 + .../flat.multimap.capacity/empty.verify.cpp | 22 + .../flat.multimap.capacity/max_size.pass.cpp | 78 ++ .../flat.multimap.capacity/size.pass.cpp | 70 ++ .../flat.multimap.cons/alloc.pass.cpp | 72 ++ .../assign_initializer_list.pass.cpp | 58 + .../flat.multimap.cons/compare.pass.cpp | 93 ++ .../flat.multimap.cons/containers.pass.cpp | 187 +++ .../flat.multimap.cons/copy.pass.cpp | 70 ++ .../flat.multimap.cons/copy_alloc.pass.cpp | 67 ++ .../copy_assign.addressof.compile.pass.cpp | 30 + .../flat.multimap.cons/copy_assign.pass.cpp | 81 ++ .../deduct.compile.pass.cpp | 52 + .../flat.multimap.cons/deduct.pass.cpp | 343 ++++++ .../flat.multimap.cons/deduct.verify.cpp | 57 + .../flat.multimap.cons/deduct_pmr.pass.cpp | 107 ++ .../flat.multimap.cons/default.pass.cpp | 72 ++ .../default_noexcept.pass.cpp | 61 + .../flat.multimap.cons/dtor_noexcept.pass.cpp | 57 + .../initializer_list.pass.cpp | 159 +++ .../flat.multimap.cons/iter_iter.pass.cpp | 154 +++ .../flat.multimap.cons/move.pass.cpp | 89 ++ .../flat.multimap.cons/move_alloc.pass.cpp | 82 ++ .../flat.multimap.cons/move_assign.pass.cpp | 74 ++ 
.../move_assign_clears.pass.cpp | 101 ++ .../move_assign_noexcept.pass.cpp | 110 ++ .../move_exceptions.pass.cpp | 71 ++ .../flat.multimap.cons/move_noexcept.pass.cpp | 104 ++ .../flat.multimap.cons/pmr.pass.cpp | 361 ++++++ .../flat.multimap.cons/range.pass.cpp | 227 ++++ .../sorted_container.pass.cpp | 165 +++ .../sorted_initializer_list.pass.cpp | 183 +++ .../sorted_iter_iter.pass.cpp | 173 +++ .../flat.multimap.erasure/erase_if.pass.cpp | 98 ++ .../erase_if_exceptions.pass.cpp | 157 +++ .../flat.multimap.iterators/iterator.pass.cpp | 105 ++ .../iterator_comparison.pass.cpp | 155 +++ ...rator_concept_conformance.compile.pass.cpp | 84 ++ ...range_concept_conformance.compile.pass.cpp | 55 + .../reverse_iterator.pass.cpp | 92 ++ .../flat.multimap.modifiers/clear.pass.cpp | 64 ++ .../flat.multimap.modifiers/emplace.pass.cpp | 158 +++ .../emplace_hint.pass.cpp | 228 ++++ .../erase_iter.pass.cpp | 127 +++ .../erase_iter_iter.pass.cpp | 99 ++ .../erase_key.pass.cpp | 99 ++ .../erase_key_transparent.pass.cpp | 161 +++ .../flat.multimap.modifiers/extract.pass.cpp | 93 ++ .../insert_cv.pass.cpp | 81 ++ .../insert_initializer_list.pass.cpp | 83 ++ .../insert_iter_cv.pass.cpp | 95 ++ .../insert_iter_iter.pass.cpp | 109 ++ .../insert_iter_rv.pass.cpp | 103 ++ .../insert_range.pass.cpp | 101 ++ .../insert_rv.pass.cpp | 116 ++ .../insert_sorted_initializer_list.pass.cpp | 66 ++ .../insert_sorted_iter_iter.pass.cpp | 94 ++ .../insert_transparent.pass.cpp | 135 +++ .../flat.multimap.modifiers/replace.pass.cpp | 82 ++ .../swap_exception.pass.cpp | 80 ++ .../swap_free.pass.cpp | 99 ++ .../swap_member.pass.cpp | 97 ++ .../flat.multimap.observers/comp.pass.cpp | 98 ++ .../keys_values.pass.cpp | 59 + .../contains.pass.cpp | 72 ++ .../contains_transparent.pass.cpp | 73 ++ .../flat.multimap.operations/count.pass.cpp | 71 ++ .../count_transparent.pass.cpp | 83 ++ .../equal_range.pass.cpp | 81 ++ .../equal_range_transparent.pass.cpp | 110 ++ .../flat.multimap.operations/find.pass.cpp | 57 + .../find_transparent.pass.cpp | 99 ++ .../lower_bound.pass.cpp | 73 ++ .../lower_bound_transparent.pass.cpp | 107 ++ .../upper_bound.pass.cpp | 76 ++ .../upper_bound_transparent.pass.cpp | 106 ++ .../flat.multimap/helpers.h | 389 +++++++ .../flat.multimap/incomplete_type.pass.cpp | 33 + .../flat.multimap/op_compare.pass.cpp | 133 +++ .../flat.multimap/types.compile.pass.cpp | 133 +++ .../flat_map.version.compile.pass.cpp | 68 ++ .../version.version.compile.pass.cpp | 74 ++ .../generate_feature_test_macro_components.py | 11 + 107 files changed, 10468 insertions(+), 177 deletions(-) create mode 100644 libcxx/include/__flat_map/flat_multimap.h create mode 100644 libcxx/include/__flat_map/sorted_equivalent.h create mode 100644 libcxx/include/__flat_map/utils.h rename libcxx/test/libcxx/containers/{containers.adaptors => container.adaptors}/flat.map/assert.input_range.pass.cpp (100%) rename libcxx/test/libcxx/containers/{containers.adaptors => container.adaptors}/flat.map/assert.sorted_unique.pass.cpp (100%) create mode 100644 libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.input_range.pass.cpp create mode 100644 libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.sorted_equivalent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.map.syn/sorted_equivalent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.compile.pass.cpp create mode 100644 
libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.verify.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/max_size.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/size.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/alloc.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/assign_initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/compare.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/containers.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_alloc.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.addressof.compile.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.compile.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.verify.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct_pmr.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default_noexcept.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/dtor_noexcept.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_alloc.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_clears.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_noexcept.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_noexcept.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/pmr.pass.cpp create mode 100644 
libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/range.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_container.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if_exceptions.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_comparison.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_concept_conformance.compile.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/range_concept_conformance.compile.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/reverse_iterator.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/clear.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace_hint.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/extract.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_cv.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_cv.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_rv.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_range.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_rv.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_initializer_list.pass.cpp create mode 100644 
libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/replace.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_exception.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_free.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_member.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/comp.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/keys_values.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/incomplete_type.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/op_compare.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/types.compile.pass.cpp create mode 100644 libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index cfb0e5cfb129c..ccaa784ccb088 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -330,6 +330,10 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_expected`` ``202211L`` ---------------------------------------------------------- ----------------- + ``__cpp_lib_flat_map`` ``202207L`` + 
---------------------------------------------------------- ----------------- + ``__cpp_lib_flat_set`` *unimplemented* + ---------------------------------------------------------- ----------------- ``__cpp_lib_format_ranges`` ``202207L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_formatters`` *unimplemented* diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv index 24398574064e6..bc9d4f8866a73 100644 --- a/libcxx/docs/Status/Cxx23Papers.csv +++ b/libcxx/docs/Status/Cxx23Papers.csv @@ -52,7 +52,7 @@ "`P2443R1 `__","``views::chunk_by``","2022-02 (Virtual)","|Complete|","18","" "","","","","","" "`P0009R18 `__","mdspan: A Non-Owning Multidimensional Array Reference","2022-07 (Virtual)","|Complete|","18","" -"`P0429R9 `__","A Standard ``flat_map``","2022-07 (Virtual)","|In Progress|","","" +"`P0429R9 `__","A Standard ``flat_map``","2022-07 (Virtual)","|Complete|","","" "`P1169R4 `__","``static operator()``","2022-07 (Virtual)","|Complete|","16","" "`P1222R4 `__","A Standard ``flat_set``","2022-07 (Virtual)","","","" "`P1223R5 `__","``ranges::find_last()``, ``ranges::find_last_if()``, and ``ranges::find_last_if_not()``","2022-07 (Virtual)","|Complete|","19","" diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 78d3192542b5a..8dac823503d73 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -362,8 +362,11 @@ set(files __filesystem/space_info.h __filesystem/u8path.h __flat_map/flat_map.h + __flat_map/flat_multimap.h __flat_map/key_value_iterator.h + __flat_map/sorted_equivalent.h __flat_map/sorted_unique.h + __flat_map/utils.h __format/buffer.h __format/concepts.h __format/container_adaptor.h diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h index ab53b7a285ca4..a0594ed9dc411 100644 --- a/libcxx/include/__flat_map/flat_map.h +++ b/libcxx/include/__flat_map/flat_map.h @@ -29,9 +29,11 @@ #include <__cstddef/ptrdiff_t.h> #include <__flat_map/key_value_iterator.h> #include <__flat_map/sorted_unique.h> +#include <__flat_map/utils.h> #include <__functional/invoke.h> #include <__functional/is_transparent.h> #include <__functional/operations.h> +#include <__fwd/vector.h> #include <__iterator/concepts.h> #include <__iterator/distance.h> #include <__iterator/iterator_traits.h> @@ -131,7 +133,7 @@ class flat_map { _LIBCPP_HIDE_FROM_ABI static constexpr bool __allocator_ctor_constraint = _And, uses_allocator>::value; - _LIBCPP_HIDE_FROM_ABI static constexpr bool __is_compare_transparent = __is_transparent_v<_Compare, _Compare>; + _LIBCPP_HIDE_FROM_ABI static constexpr bool __is_compare_transparent = __is_transparent_v<_Compare>; public: // [flat.map.cons], construct/copy/destroy @@ -153,7 +155,7 @@ class flat_map { # if _LIBCPP_HAS_EXCEPTIONS } catch (...) 
{ __other.clear(); - // gcc does not like the `throw` keyword in a conditional noexcept function + // gcc does not like the `throw` keyword in a conditionally noexcept function if constexpr (!(is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_MappedContainer> && is_nothrow_move_constructible_v<_Compare>)) { throw; @@ -518,16 +520,16 @@ class flat_map { return emplace_hint(__hint, std::move(__x)); } - template - requires is_constructible_v, _Pp> - _LIBCPP_HIDE_FROM_ABI pair insert(_Pp&& __x) { - return emplace(std::forward<_Pp>(__x)); + template + requires is_constructible_v, _PairLike> + _LIBCPP_HIDE_FROM_ABI pair insert(_PairLike&& __x) { + return emplace(std::forward<_PairLike>(__x)); } - template - requires is_constructible_v, _Pp> - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, _Pp&& __x) { - return emplace_hint(__hint, std::forward<_Pp>(__x)); + template + requires is_constructible_v, _PairLike> + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, _PairLike&& __x) { + return emplace_hint(__hint, std::forward<_PairLike>(__x)); } template @@ -860,22 +862,10 @@ class flat_map { __containers_.values.erase(__containers_.values.begin() + __dist, __containers_.values.end()); } - template - _LIBCPP_HIDE_FROM_ABI size_type __append(_InputIterator __first, _Sentinel __last) { - size_type __num_of_appended = 0; - for (; __first != __last; ++__first) { - value_type __kv = *__first; - __containers_.keys.insert(__containers_.keys.end(), std::move(__kv.first)); - __containers_.values.insert(__containers_.values.end(), std::move(__kv.second)); - ++__num_of_appended; - } - return __num_of_appended; - } - template _LIBCPP_HIDE_FROM_ABI void __append_sort_merge_unique(_InputIterator __first, _Sentinel __last) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); - size_t __num_of_appended = __append(std::move(__first), std::move(__last)); + size_t __num_of_appended = __flat_map_utils::__append(*this, std::move(__first), std::move(__last)); if (__num_of_appended != 0) { auto __zv = ranges::views::zip(__containers_.keys, __containers_.values); auto __append_start_offset = __containers_.keys.size() - __num_of_appended; @@ -963,7 +953,8 @@ class flat_map { if (__key_it == __containers_.keys.end() || __compare_(__key, *__key_it)) { return pair( - __try_emplace_exact_hint( + __flat_map_utils::__emplace_exact_pos( + *this, std::move(__key_it), std::move(__mapped_it), std::forward<_KeyArg>(__key), @@ -989,10 +980,13 @@ class flat_map { _LIBCPP_HIDE_FROM_ABI pair __try_emplace_hint(const_iterator __hint, _Kp&& __key, _Args&&... __args) { if (__is_hint_correct(__hint, __key)) { if (__hint == cend() || __compare_(__key, __hint->first)) { - return { - __try_emplace_exact_hint( - __hint.__key_iter_, __hint.__mapped_iter_, std::forward<_Kp>(__key), std::forward<_Args>(__args)...), - true}; + return {__flat_map_utils::__emplace_exact_pos( + *this, + __hint.__key_iter_, + __hint.__mapped_iter_, + std::forward<_Kp>(__key), + std::forward<_Args>(__args)...), + true}; } else { // key equals auto __dist = __hint - cbegin(); @@ -1003,49 +997,6 @@ class flat_map { } } - template - _LIBCPP_HIDE_FROM_ABI iterator - __try_emplace_exact_hint(_IterK&& __it_key, _IterM&& __it_mapped, _KeyArg&& __key, _MArgs&&... __mapped_args) { - auto __on_key_failed = std::__make_exception_guard([&]() noexcept { - if constexpr (__container_traits<_KeyContainer>::__emplacement_has_strong_exception_safety_guarantee) { - // Nothing to roll back! 
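As an aside on the hunk above: the rename of `_Pp` to `_PairLike` is purely cosmetic, since the `is_constructible_v<pair<key_type, mapped_type>, ...>` constraint is unchanged; the new name just documents that these `insert` overloads accept any pair-like argument, not only `std::pair`. A minimal caller-side sketch of what the constraint admits (my illustration, not part of the patch; the `tuple` line relies on C++23's pair-like constructors for `std::pair`):

    #include <flat_map>
    #include <string>
    #include <tuple>

    int main() {
      std::flat_map<int, std::string> m;
      m.insert({1, "one"});                        // value_type (std::pair) directly
      m.insert(std::tuple{2, std::string{"two"}}); // any pair-like type is accepted
      return 0;
    }
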
- } else { - // we need to clear both because we don't know the state of our keys anymore - clear() /* noexcept */; - } - }); - auto __key_it = __containers_.keys.emplace(__it_key, std::forward<_KeyArg>(__key)); - __on_key_failed.__complete(); - - auto __on_value_failed = std::__make_exception_guard([&]() noexcept { - if constexpr (!__container_traits<_MappedContainer>::__emplacement_has_strong_exception_safety_guarantee) { - // we need to clear both because we don't know the state of our values anymore - clear() /* noexcept */; - } else { - // In this case, we know the values are just like before we attempted emplacement, - // and we also know that the keys have been emplaced successfully. Just roll back the keys. -# if _LIBCPP_HAS_EXCEPTIONS - try { -# endif // _LIBCPP_HAS_EXCEPTIONS - __containers_.keys.erase(__key_it); -# if _LIBCPP_HAS_EXCEPTIONS - } catch (...) { - // Now things are funky for real. We're failing to rollback the keys. - // Just give up and clear the whole thing. - // - // Also, swallow the exception that happened during the rollback and let the - // original value-emplacement exception propagate normally. - clear() /* noexcept */; - } -# endif // _LIBCPP_HAS_EXCEPTIONS - } - }); - auto __mapped_it = __containers_.values.emplace(__it_mapped, std::forward<_MArgs>(__mapped_args)...); - __on_value_failed.__complete(); - - return iterator(std::move(__key_it), std::move(__mapped_it)); - } - template _LIBCPP_HIDE_FROM_ABI pair __insert_or_assign(_Kp&& __key, _Mapped&& __mapped) { auto __r = try_emplace(std::forward<_Kp>(__key), std::forward<_Mapped>(__mapped)); @@ -1087,8 +1038,10 @@ class flat_map { friend typename flat_map<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>::size_type erase_if(flat_map<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>&, _Predicate); + friend __flat_map_utils; + containers __containers_; - [[no_unique_address]] key_compare __compare_; + _LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_; struct __key_equiv { _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {} @@ -1187,22 +1140,20 @@ template >, class _Allocator = allocator, class = __enable_if_t::value && __is_allocator<_Allocator>::value>> -flat_map(from_range_t, _Range&&, _Compare = _Compare(), _Allocator = _Allocator()) - -> flat_map< - __range_key_type<_Range>, - __range_mapped_type<_Range>, - _Compare, - vector<__range_key_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_key_type<_Range>>>, - vector<__range_mapped_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_mapped_type<_Range>>>>; +flat_map(from_range_t, _Range&&, _Compare = _Compare(), _Allocator = _Allocator()) -> flat_map< + __range_key_type<_Range>, + __range_mapped_type<_Range>, + _Compare, + vector<__range_key_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_key_type<_Range>>>, + vector<__range_mapped_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_mapped_type<_Range>>>>; template ::value>> -flat_map(from_range_t, _Range&&, _Allocator) - -> flat_map< - __range_key_type<_Range>, - __range_mapped_type<_Range>, - less<__range_key_type<_Range>>, - vector<__range_key_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_key_type<_Range>>>, - vector<__range_mapped_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_mapped_type<_Range>>>>; +flat_map(from_range_t, _Range&&, _Allocator) -> flat_map< + __range_key_type<_Range>, + __range_mapped_type<_Range>, + less<__range_key_type<_Range>>, + vector<__range_key_type<_Range>, 
__allocator_traits_rebind_t<_Allocator, __range_key_type<_Range>>>, + vector<__range_mapped_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_mapped_type<_Range>>>>; template > requires(!__is_allocator<_Compare>::value) diff --git a/libcxx/include/__flat_map/flat_multimap.h b/libcxx/include/__flat_map/flat_multimap.h new file mode 100644 index 0000000000000..ea77fb5d79bd2 --- /dev/null +++ b/libcxx/include/__flat_map/flat_multimap.h @@ -0,0 +1,1010 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___FLAT_MAP_FLAT_MULTIMAP_H +#define _LIBCPP___FLAT_MAP_FLAT_MULTIMAP_H + +#include <__algorithm/lexicographical_compare_three_way.h> +#include <__algorithm/min.h> +#include <__algorithm/ranges_equal.h> +#include <__algorithm/ranges_equal_range.h> +#include <__algorithm/ranges_inplace_merge.h> +#include <__algorithm/ranges_is_sorted.h> +#include <__algorithm/ranges_lower_bound.h> +#include <__algorithm/ranges_partition_point.h> +#include <__algorithm/ranges_sort.h> +#include <__algorithm/ranges_unique.h> +#include <__algorithm/ranges_upper_bound.h> +#include <__algorithm/remove_if.h> +#include <__assert> +#include <__compare/synth_three_way.h> +#include <__concepts/convertible_to.h> +#include <__concepts/swappable.h> +#include <__config> +#include <__cstddef/byte.h> +#include <__cstddef/ptrdiff_t.h> +#include <__flat_map/key_value_iterator.h> +#include <__flat_map/sorted_equivalent.h> +#include <__flat_map/utils.h> +#include <__functional/invoke.h> +#include <__functional/is_transparent.h> +#include <__functional/operations.h> +#include <__fwd/vector.h> +#include <__iterator/concepts.h> +#include <__iterator/distance.h> +#include <__iterator/iterator_traits.h> +#include <__iterator/ranges_iterator_traits.h> +#include <__iterator/reverse_iterator.h> +#include <__memory/allocator_traits.h> +#include <__memory/uses_allocator.h> +#include <__memory/uses_allocator_construction.h> +#include <__ranges/access.h> +#include <__ranges/concepts.h> +#include <__ranges/container_compatible_range.h> +#include <__ranges/drop_view.h> +#include <__ranges/from_range.h> +#include <__ranges/ref_view.h> +#include <__ranges/size.h> +#include <__ranges/subrange.h> +#include <__ranges/zip_view.h> +#include <__type_traits/conjunction.h> +#include <__type_traits/container_traits.h> +#include <__type_traits/invoke.h> +#include <__type_traits/is_allocator.h> +#include <__type_traits/is_nothrow_constructible.h> +#include <__type_traits/is_same.h> +#include <__type_traits/maybe_const.h> +#include <__utility/exception_guard.h> +#include <__utility/move.h> +#include <__utility/pair.h> +#include <__utility/scope_guard.h> +#include <__vector/vector.h> +#include +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +#if _LIBCPP_STD_VER >= 23 + +_LIBCPP_BEGIN_NAMESPACE_STD + +template , + class _KeyContainer = vector<_Key>, + class _MappedContainer = vector<_Tp>> +class flat_multimap { + template + friend class flat_multimap; + + static_assert(is_same_v<_Key, typename _KeyContainer::value_type>); + static_assert(is_same_v<_Tp, typename _MappedContainer::value_type>); + 
static_assert(!is_same_v<_KeyContainer, std::vector>, "vector is not a sequence container"); + static_assert(!is_same_v<_MappedContainer, std::vector>, "vector is not a sequence container"); + + template + using __iterator _LIBCPP_NODEBUG = __key_value_iterator; + +public: + // types + using key_type = _Key; + using mapped_type = _Tp; + using value_type = pair; + using key_compare = __type_identity_t<_Compare>; + using reference = pair; + using const_reference = pair; + using size_type = size_t; + using difference_type = ptrdiff_t; + using iterator = __iterator; // see [container.requirements] + using const_iterator = __iterator; // see [container.requirements] + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + using key_container_type = _KeyContainer; + using mapped_container_type = _MappedContainer; + + class value_compare { + private: + key_compare __comp_; + _LIBCPP_HIDE_FROM_ABI value_compare(key_compare __c) : __comp_(__c) {} + friend flat_multimap; + + public: + _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const { + return __comp_(__x.first, __y.first); + } + }; + + struct containers { + key_container_type keys; + mapped_container_type values; + }; + +private: + template + _LIBCPP_HIDE_FROM_ABI static constexpr bool __allocator_ctor_constraint = + _And, uses_allocator>::value; + + _LIBCPP_HIDE_FROM_ABI static constexpr bool __is_compare_transparent = __is_transparent_v<_Compare>; + +public: + // [flat.map.cons], construct/copy/destroy + _LIBCPP_HIDE_FROM_ABI flat_multimap() noexcept( + is_nothrow_default_constructible_v<_KeyContainer> && is_nothrow_default_constructible_v<_MappedContainer> && + is_nothrow_default_constructible_v<_Compare>) + : __containers_(), __compare_() {} + + _LIBCPP_HIDE_FROM_ABI flat_multimap(const flat_multimap&) = default; + + // The copy/move constructors are not specified in the spec, which means they should be defaulted. + // However, the move constructor can potentially leave a moved-from object in an inconsistent + // state if an exception is thrown. + _LIBCPP_HIDE_FROM_ABI flat_multimap(flat_multimap&& __other) noexcept( + is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_MappedContainer> && + is_nothrow_move_constructible_v<_Compare>) +# if _LIBCPP_HAS_EXCEPTIONS + try +# endif // _LIBCPP_HAS_EXCEPTIONS + : __containers_(std::move(__other.__containers_)), __compare_(std::move(__other.__compare_)) { + __other.clear(); +# if _LIBCPP_HAS_EXCEPTIONS + } catch (...) 
{ + __other.clear(); + // gcc does not like the `throw` keyword in a conditionally noexcept function + if constexpr (!(is_nothrow_move_constructible_v<_KeyContainer> && + is_nothrow_move_constructible_v<_MappedContainer> && is_nothrow_move_constructible_v<_Compare>)) { + throw; + } +# endif // _LIBCPP_HAS_EXCEPTIONS + } + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(const flat_multimap& __other, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_tag{}, + __alloc, + __other.__containers_.keys, + __other.__containers_.values, + __other.__compare_) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(flat_multimap&& __other, const _Allocator& __alloc) +# if _LIBCPP_HAS_EXCEPTIONS + try +# endif // _LIBCPP_HAS_EXCEPTIONS + : flat_multimap(__ctor_uses_allocator_tag{}, + __alloc, + std::move(__other.__containers_.keys), + std::move(__other.__containers_.values), + std::move(__other.__compare_)) { + __other.clear(); +# if _LIBCPP_HAS_EXCEPTIONS + } catch (...) { + __other.clear(); + throw; +# endif // _LIBCPP_HAS_EXCEPTIONS + } + + _LIBCPP_HIDE_FROM_ABI flat_multimap( + key_container_type __key_cont, mapped_container_type __mapped_cont, const key_compare& __comp = key_compare()) + : __containers_{.keys = std::move(__key_cont), .values = std::move(__mapped_cont)}, __compare_(__comp) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), + "flat_multimap keys and mapped containers have different size"); + __sort(); + } + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap( + const key_container_type& __key_cont, const mapped_container_type& __mapped_cont, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), + "flat_multimap keys and mapped containers have different size"); + __sort(); + } + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI + flat_multimap(const key_container_type& __key_cont, + const mapped_container_type& __mapped_cont, + const key_compare& __comp, + const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont, __comp) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), + "flat_multimap keys and mapped containers have different size"); + __sort(); + } + + _LIBCPP_HIDE_FROM_ABI + flat_multimap(sorted_equivalent_t, + key_container_type __key_cont, + mapped_container_type __mapped_cont, + const key_compare& __comp = key_compare()) + : __containers_{.keys = std::move(__key_cont), .values = std::move(__mapped_cont)}, __compare_(__comp) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), + "flat_multimap keys and mapped containers have different size"); + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(__is_sorted(__containers_.keys), "Key container is not sorted"); + } + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI + flat_multimap(sorted_equivalent_t, + const key_container_type& __key_cont, + const mapped_container_type& __mapped_cont, + const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == 
__containers_.values.size(), + "flat_multimap keys and mapped containers have different size"); + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(__is_sorted(__containers_.keys), "Key container is not sorted"); + } + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI + flat_multimap(sorted_equivalent_t, + const key_container_type& __key_cont, + const mapped_container_type& __mapped_cont, + const key_compare& __comp, + const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont, __comp) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), + "flat_multimap keys and mapped containers have different size"); + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(__is_sorted(__containers_.keys), "Key container is not sorted"); + } + + _LIBCPP_HIDE_FROM_ABI explicit flat_multimap(const key_compare& __comp) : __containers_(), __compare_(__comp) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(const key_compare& __comp, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI explicit flat_multimap(const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) {} + + template + requires __has_input_iterator_category<_InputIterator>::value + _LIBCPP_HIDE_FROM_ABI + flat_multimap(_InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) + : __containers_(), __compare_(__comp) { + insert(__first, __last); + } + + template + requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) + _LIBCPP_HIDE_FROM_ABI + flat_multimap(_InputIterator __first, _InputIterator __last, const key_compare& __comp, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) { + insert(__first, __last); + } + + template + requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) + _LIBCPP_HIDE_FROM_ABI flat_multimap(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) { + insert(__first, __last); + } + + template <_ContainerCompatibleRange _Range> + _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t __fr, _Range&& __rg) + : flat_multimap(__fr, std::forward<_Range>(__rg), key_compare()) {} + + template <_ContainerCompatibleRange _Range, class _Allocator> + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t, _Range&& __rg, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) { + insert_range(std::forward<_Range>(__rg)); + } + + template <_ContainerCompatibleRange _Range> + _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t, _Range&& __rg, const key_compare& __comp) : flat_multimap(__comp) { + insert_range(std::forward<_Range>(__rg)); + } + + template <_ContainerCompatibleRange _Range, class _Allocator> + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) { + insert_range(std::forward<_Range>(__rg)); + } + + template + requires __has_input_iterator_category<_InputIterator>::value + _LIBCPP_HIDE_FROM_ABI 
flat_multimap( + sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) + : __containers_(), __compare_(__comp) { + insert(sorted_equivalent, __first, __last); + } + template + requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) + _LIBCPP_HIDE_FROM_ABI + flat_multimap(sorted_equivalent_t, + _InputIterator __first, + _InputIterator __last, + const key_compare& __comp, + const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) { + insert(sorted_equivalent, __first, __last); + } + + template + requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) + _LIBCPP_HIDE_FROM_ABI + flat_multimap(sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) { + insert(sorted_equivalent, __first, __last); + } + + _LIBCPP_HIDE_FROM_ABI flat_multimap(initializer_list __il, const key_compare& __comp = key_compare()) + : flat_multimap(__il.begin(), __il.end(), __comp) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI + flat_multimap(initializer_list __il, const key_compare& __comp, const _Allocator& __alloc) + : flat_multimap(__il.begin(), __il.end(), __comp, __alloc) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(initializer_list __il, const _Allocator& __alloc) + : flat_multimap(__il.begin(), __il.end(), __alloc) {} + + _LIBCPP_HIDE_FROM_ABI + flat_multimap(sorted_equivalent_t, initializer_list __il, const key_compare& __comp = key_compare()) + : flat_multimap(sorted_equivalent, __il.begin(), __il.end(), __comp) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap( + sorted_equivalent_t, initializer_list __il, const key_compare& __comp, const _Allocator& __alloc) + : flat_multimap(sorted_equivalent, __il.begin(), __il.end(), __comp, __alloc) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(sorted_equivalent_t, initializer_list __il, const _Allocator& __alloc) + : flat_multimap(sorted_equivalent, __il.begin(), __il.end(), __alloc) {} + + _LIBCPP_HIDE_FROM_ABI flat_multimap& operator=(initializer_list __il) { + clear(); + insert(__il); + return *this; + } + + // copy/move assignment are not specified in the spec (defaulted) + // but move assignment can potentially leave moved from object in an inconsistent + // state if an exception is thrown + _LIBCPP_HIDE_FROM_ABI flat_multimap& operator=(const flat_multimap&) = default; + + _LIBCPP_HIDE_FROM_ABI flat_multimap& operator=(flat_multimap&& __other) noexcept( + is_nothrow_move_assignable_v<_KeyContainer> && is_nothrow_move_assignable_v<_MappedContainer> && + is_nothrow_move_assignable_v<_Compare>) { + auto __clear_other_guard = std::__make_scope_guard([&]() noexcept { __other.clear() /* noexcept */; }); + auto __clear_self_guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + __containers_ = std::move(__other.__containers_); + __compare_ = std::move(__other.__compare_); + __clear_self_guard.__complete(); + return *this; + } + + // iterators + _LIBCPP_HIDE_FROM_ABI iterator begin() noexcept { + return iterator(__containers_.keys.begin(), __containers_.values.begin()); + } + + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const noexcept { + 
return const_iterator(__containers_.keys.begin(), __containers_.values.begin()); + } + + _LIBCPP_HIDE_FROM_ABI iterator end() noexcept { + return iterator(__containers_.keys.end(), __containers_.values.end()); + } + + _LIBCPP_HIDE_FROM_ABI const_iterator end() const noexcept { + return const_iterator(__containers_.keys.end(), __containers_.values.end()); + } + + _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() noexcept { return reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } + + _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const noexcept { return begin(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cend() const noexcept { return end(); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept { return const_reverse_iterator(begin()); } + + // [flat.map.capacity], capacity + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool empty() const noexcept { return __containers_.keys.empty(); } + + _LIBCPP_HIDE_FROM_ABI size_type size() const noexcept { return __containers_.keys.size(); } + + _LIBCPP_HIDE_FROM_ABI size_type max_size() const noexcept { + return std::min(__containers_.keys.max_size(), __containers_.values.max_size()); + } + + // [flat.map.modifiers], modifiers + template + requires is_constructible_v, _Args...> && is_move_constructible_v && + is_move_constructible_v + _LIBCPP_HIDE_FROM_ABI iterator emplace(_Args&&... __args) { + std::pair __pair(std::forward<_Args>(__args)...); + auto __key_it = ranges::upper_bound(__containers_.keys, __pair.first, __compare_); + auto __mapped_it = __corresponding_mapped_it(*this, __key_it); + + return __flat_map_utils::__emplace_exact_pos( + *this, std::move(__key_it), std::move(__mapped_it), std::move(__pair.first), std::move(__pair.second)); + } + + template + requires is_constructible_v, _Args...> + _LIBCPP_HIDE_FROM_ABI iterator emplace_hint(const_iterator __hint, _Args&&... __args) { + std::pair __pair(std::forward<_Args>(__args)...); + + auto __prev_larger = __hint != cbegin() && __compare_(__pair.first, (__hint - 1)->first); + auto __next_smaller = __hint != cend() && __compare_(__hint->first, __pair.first); + + auto __hint_distance = __hint.__key_iter_ - __containers_.keys.cbegin(); + auto __key_iter = __containers_.keys.begin() + __hint_distance; + auto __mapped_iter = __containers_.values.begin() + __hint_distance; + + if (!__prev_larger && !__next_smaller) [[likely]] { + // hint correct, just use exact hint iterators + } else if (__prev_larger && !__next_smaller) { + // the hint position is more to the right than the key should have been. + // we want to emplace the element to a position as right as possible + // e.g. Insert new element "2" in the following range + // 1, 1, 2, 2, 2, 3, 4, 6 + // ^ + // | + // hint + // We want to insert "2" after the last existing "2" + __key_iter = ranges::upper_bound(__containers_.keys.begin(), __key_iter, __pair.first, __compare_); + __mapped_iter = __corresponding_mapped_it(*this, __key_iter); + } else { + _LIBCPP_ASSERT_INTERNAL(!__prev_larger && __next_smaller, "this means that the multimap is not sorted"); + + // the hint position is more to the left than the key should have been. 
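To make the two hint diagrams in this comment block concrete, here is a small caller-side sketch of the resulting behavior (my illustration, not one of the patch's tests; the relative order among equal keys follows from the upper_bound/lower_bound fallbacks implemented here):

    #include <flat_map>

    int main() {
      std::flat_multimap<int, char> m{{1, 'a'}, {2, 'b'}, {2, 'c'}, {3, 'd'}};
      m.emplace_hint(m.end(), 2, 'e');   // hint right of the 2s: lands after the last 2
      m.emplace_hint(m.begin(), 2, 'f'); // hint left of the 2s: lands before the first 2
      // Keys are now 1, 2, 2, 2, 2, 3; the values mapped to key 2 read f, b, c, e.
      return 0;
    }
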
+ // we want to emplace the element to a position as left as possible + // 1, 1, 2, 2, 2, 3, 4, 6 + // ^ + // | + // hint + // We want to insert "2" before the first existing "2" + __key_iter = ranges::lower_bound(__key_iter, __containers_.keys.end(), __pair.first, __compare_); + __mapped_iter = __corresponding_mapped_it(*this, __key_iter); + } + return __flat_map_utils::__emplace_exact_pos( + *this, __key_iter, __mapped_iter, std::move(__pair.first), std::move(__pair.second)); + } + + _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return emplace(__x); } + + _LIBCPP_HIDE_FROM_ABI iterator insert(value_type&& __x) { return emplace(std::move(__x)); } + + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, const value_type& __x) { + return emplace_hint(__hint, __x); + } + + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, value_type&& __x) { + return emplace_hint(__hint, std::move(__x)); + } + + template + requires is_constructible_v, _PairLike> + _LIBCPP_HIDE_FROM_ABI iterator insert(_PairLike&& __x) { + return emplace(std::forward<_PairLike>(__x)); + } + + template + requires is_constructible_v, _PairLike> + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, _PairLike&& __x) { + return emplace_hint(__hint, std::forward<_PairLike>(__x)); + } + + template + requires __has_input_iterator_category<_InputIterator>::value + _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last) { + if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { + __reserve(__last - __first); + } + __append_sort_merge(std::move(__first), std::move(__last)); + } + + template + requires __has_input_iterator_category<_InputIterator>::value + _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { + if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { + __reserve(__last - __first); + } + + __append_sort_merge(std::move(__first), std::move(__last)); + } + + template <_ContainerCompatibleRange _Range> + _LIBCPP_HIDE_FROM_ABI void insert_range(_Range&& __range) { + if constexpr (ranges::sized_range<_Range>) { + __reserve(ranges::size(__range)); + } + + __append_sort_merge(ranges::begin(__range), ranges::end(__range)); + } + + _LIBCPP_HIDE_FROM_ABI void insert(initializer_list __il) { insert(__il.begin(), __il.end()); } + + _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, initializer_list __il) { + insert(sorted_equivalent, __il.begin(), __il.end()); + } + + _LIBCPP_HIDE_FROM_ABI containers extract() && { + auto __guard = std::__make_scope_guard([&]() noexcept { clear() /* noexcept */; }); + auto __ret = std::move(__containers_); + return __ret; + } + + _LIBCPP_HIDE_FROM_ABI void replace(key_container_type&& __key_cont, mapped_container_type&& __mapped_cont) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE( + __key_cont.size() == __mapped_cont.size(), "flat_multimap keys and mapped containers have different size"); + + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(__is_sorted(__key_cont), "Key container is not sorted"); + auto __guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + __containers_.keys = std::move(__key_cont); + __containers_.values = std::move(__mapped_cont); + __guard.__complete(); + } + + _LIBCPP_HIDE_FROM_ABI iterator erase(iterator __position) { + return __erase(__position.__key_iter_, __position.__mapped_iter_); + } + + _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __position) { + return __erase(__position.__key_iter_, 
__position.__mapped_iter_); + } + + _LIBCPP_HIDE_FROM_ABI size_type erase(const key_type& __x) { + auto [__first, __last] = equal_range(__x); + auto __res = __last - __first; + erase(__first, __last); + return __res; + } + + template + requires(__is_compare_transparent && !is_convertible_v<_Kp &&, iterator> && + !is_convertible_v<_Kp &&, const_iterator>) + _LIBCPP_HIDE_FROM_ABI size_type erase(_Kp&& __x) { + auto [__first, __last] = equal_range(__x); + auto __res = __last - __first; + erase(__first, __last); + return __res; + } + + _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last) { + auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + auto __key_it = __containers_.keys.erase(__first.__key_iter_, __last.__key_iter_); + auto __mapped_it = __containers_.values.erase(__first.__mapped_iter_, __last.__mapped_iter_); + __on_failure.__complete(); + return iterator(std::move(__key_it), std::move(__mapped_it)); + } + + _LIBCPP_HIDE_FROM_ABI void swap(flat_multimap& __y) noexcept { + // warning: The spec has unconditional noexcept, which means that + // if any of the following functions throw an exception, + // std::terminate will be called + ranges::swap(__compare_, __y.__compare_); + ranges::swap(__containers_.keys, __y.__containers_.keys); + ranges::swap(__containers_.values, __y.__containers_.values); + } + + _LIBCPP_HIDE_FROM_ABI void clear() noexcept { + __containers_.keys.clear(); + __containers_.values.clear(); + } + + // observers + _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __compare_; } + _LIBCPP_HIDE_FROM_ABI value_compare value_comp() const { return value_compare(__compare_); } + + _LIBCPP_HIDE_FROM_ABI const key_container_type& keys() const noexcept { return __containers_.keys; } + _LIBCPP_HIDE_FROM_ABI const mapped_container_type& values() const noexcept { return __containers_.values; } + + // map operations + _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __x) { return __find_impl(*this, __x); } + + _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __x) const { return __find_impl(*this, __x); } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI iterator find(const _Kp& __x) { + return __find_impl(*this, __x); + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI const_iterator find(const _Kp& __x) const { + return __find_impl(*this, __x); + } + + _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __x) const { + auto [__first, __last] = equal_range(__x); + return __last - __first; + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI size_type count(const _Kp& __x) const { + auto [__first, __last] = equal_range(__x); + return __last - __first; + } + + _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __x) const { return find(__x) != end(); } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI bool contains(const _Kp& __x) const { + return find(__x) != end(); + } + + _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __x) { return __lower_bound(*this, __x); } + + _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __x) const { + return __lower_bound(*this, __x); + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _Kp& __x) { + return __lower_bound(*this, __x); + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _Kp& __x) const { + return __lower_bound(*this, 
__x); + } + + _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __x) { return __upper_bound(*this, __x); } + + _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __x) const { + return __upper_bound(*this, __x); + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _Kp& __x) { + return __upper_bound(*this, __x); + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _Kp& __x) const { + return __upper_bound(*this, __x); + } + + _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __x) { + return __equal_range_impl(*this, __x); + } + + _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __x) const { + return __equal_range_impl(*this, __x); + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI pair equal_range(const _Kp& __x) { + return __equal_range_impl(*this, __x); + } + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI pair equal_range(const _Kp& __x) const { + return __equal_range_impl(*this, __x); + } + + friend _LIBCPP_HIDE_FROM_ABI bool operator==(const flat_multimap& __x, const flat_multimap& __y) { + return ranges::equal(__x, __y); + } + + friend _LIBCPP_HIDE_FROM_ABI auto operator<=>(const flat_multimap& __x, const flat_multimap& __y) { + return std::lexicographical_compare_three_way( + __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); + } + + friend _LIBCPP_HIDE_FROM_ABI void swap(flat_multimap& __x, flat_multimap& __y) noexcept { __x.swap(__y); } + +private: + struct __ctor_uses_allocator_tag { + explicit _LIBCPP_HIDE_FROM_ABI __ctor_uses_allocator_tag() = default; + }; + struct __ctor_uses_allocator_empty_tag { + explicit _LIBCPP_HIDE_FROM_ABI __ctor_uses_allocator_empty_tag() = default; + }; + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI + flat_multimap(__ctor_uses_allocator_tag, + const _Allocator& __alloc, + _KeyCont&& __key_cont, + _MappedCont&& __mapped_cont, + _CompArg&&... __comp) + : __containers_{.keys = std::make_obj_using_allocator( + __alloc, std::forward<_KeyCont>(__key_cont)), + .values = std::make_obj_using_allocator( + __alloc, std::forward<_MappedCont>(__mapped_cont))}, + __compare_(std::forward<_CompArg>(__comp)...) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(__ctor_uses_allocator_empty_tag, const _Allocator& __alloc, _CompArg&&... __comp) + : __containers_{.keys = std::make_obj_using_allocator(__alloc), + .values = std::make_obj_using_allocator(__alloc)}, + __compare_(std::forward<_CompArg>(__comp)...) 
{} + + _LIBCPP_HIDE_FROM_ABI bool __is_sorted(auto&& __key_container) const { + return ranges::is_sorted(__key_container, __compare_); + } + + _LIBCPP_HIDE_FROM_ABI void __sort() { + auto __zv = ranges::views::zip(__containers_.keys, __containers_.values); + ranges::sort(__zv, __compare_, [](const auto& __p) -> decltype(auto) { return std::get<0>(__p); }); + } + + template + _LIBCPP_HIDE_FROM_ABI static auto __corresponding_mapped_it(_Self&& __self, _KeyIter&& __key_iter) { + return __self.__containers_.values.begin() + + static_cast>( + ranges::distance(__self.__containers_.keys.begin(), __key_iter)); + } + + template + _LIBCPP_HIDE_FROM_ABI void __append_sort_merge(_InputIterator __first, _Sentinel __last) { + auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + size_t __num_appended = __flat_map_utils::__append(*this, std::move(__first), std::move(__last)); + if (__num_appended != 0) { + auto __zv = ranges::views::zip(__containers_.keys, __containers_.values); + auto __append_start_offset = __containers_.keys.size() - __num_appended; + auto __end = __zv.end(); + auto __compare_key = [this](const auto& __p1, const auto& __p2) { + return __compare_(std::get<0>(__p1), std::get<0>(__p2)); + }; + if constexpr (!_WasSorted) { + ranges::sort(__zv.begin() + __append_start_offset, __end, __compare_key); + } else { + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT( + __is_sorted(__containers_.keys | ranges::views::drop(__append_start_offset)), + "Key container is not sorted"); + } + ranges::inplace_merge(__zv.begin(), __zv.begin() + __append_start_offset, __end, __compare_key); + } + __on_failure.__complete(); + } + + template + _LIBCPP_HIDE_FROM_ABI static auto __find_impl(_Self&& __self, const _Kp& __key) { + auto __it = __self.lower_bound(__key); + auto __last = __self.end(); + if (__it == __last || __self.__compare_(__key, __it->first)) { + return __last; + } + return __it; + } + + template + _LIBCPP_HIDE_FROM_ABI static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { + auto [__key_first, __key_last] = ranges::equal_range(__self.__containers_.keys, __key, __self.__compare_); + + using __iterator_type = ranges::iterator_t; + return std::make_pair(__iterator_type(__key_first, __corresponding_mapped_it(__self, __key_first)), + __iterator_type(__key_last, __corresponding_mapped_it(__self, __key_last))); + } + + template + _LIBCPP_HIDE_FROM_ABI static _Res __lower_bound(_Self&& __self, _Kp& __x) { + auto __key_iter = ranges::lower_bound(__self.__containers_.keys, __x, __self.__compare_); + auto __mapped_iter = __corresponding_mapped_it(__self, __key_iter); + return _Res(std::move(__key_iter), std::move(__mapped_iter)); + } + + template + _LIBCPP_HIDE_FROM_ABI static _Res __upper_bound(_Self&& __self, _Kp& __x) { + auto __key_iter = ranges::upper_bound(__self.__containers_.keys, __x, __self.__compare_); + auto __mapped_iter = __corresponding_mapped_it(__self, __key_iter); + return _Res(std::move(__key_iter), std::move(__mapped_iter)); + } + + _LIBCPP_HIDE_FROM_ABI void __reserve(size_t __size) { + if constexpr (requires { __containers_.keys.reserve(__size); }) { + __containers_.keys.reserve(__size); + } + + if constexpr (requires { __containers_.values.reserve(__size); }) { + __containers_.values.reserve(__size); + } + } + + template + _LIBCPP_HIDE_FROM_ABI iterator __erase(_KIter __key_iter_to_remove, _MIter __mapped_iter_to_remove) { + auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + auto __key_iter = 
__containers_.keys.erase(__key_iter_to_remove); + auto __mapped_iter = __containers_.values.erase(__mapped_iter_to_remove); + __on_failure.__complete(); + return iterator(std::move(__key_iter), std::move(__mapped_iter)); + } + + template + friend typename flat_multimap<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>::size_type + erase_if(flat_multimap<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>&, _Predicate); + + friend __flat_map_utils; + + containers __containers_; + _LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_; + + struct __key_equiv { + _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {} + _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const { + return !__comp_(std::get<0>(__x), std::get<0>(__y)) && !__comp_(std::get<0>(__y), std::get<0>(__x)); + } + key_compare __comp_; + }; +}; + +template > + requires(!__is_allocator<_Compare>::value && !__is_allocator<_KeyContainer>::value && + !__is_allocator<_MappedContainer>::value && + is_invocable_v) +flat_multimap(_KeyContainer, _MappedContainer, _Compare = _Compare()) + -> flat_multimap; + +template + requires(uses_allocator_v<_KeyContainer, _Allocator> && uses_allocator_v<_MappedContainer, _Allocator> && + !__is_allocator<_KeyContainer>::value && !__is_allocator<_MappedContainer>::value) +flat_multimap(_KeyContainer, _MappedContainer, _Allocator) + -> flat_multimap, + _KeyContainer, + _MappedContainer>; + +template + requires(!__is_allocator<_Compare>::value && !__is_allocator<_KeyContainer>::value && + !__is_allocator<_MappedContainer>::value && uses_allocator_v<_KeyContainer, _Allocator> && + uses_allocator_v<_MappedContainer, _Allocator> && + is_invocable_v) +flat_multimap(_KeyContainer, _MappedContainer, _Compare, _Allocator) + -> flat_multimap; + +template > + requires(!__is_allocator<_Compare>::value && !__is_allocator<_KeyContainer>::value && + !__is_allocator<_MappedContainer>::value && + is_invocable_v) +flat_multimap(sorted_equivalent_t, _KeyContainer, _MappedContainer, _Compare = _Compare()) + -> flat_multimap; + +template + requires(uses_allocator_v<_KeyContainer, _Allocator> && uses_allocator_v<_MappedContainer, _Allocator> && + !__is_allocator<_KeyContainer>::value && !__is_allocator<_MappedContainer>::value) +flat_multimap(sorted_equivalent_t, _KeyContainer, _MappedContainer, _Allocator) + -> flat_multimap, + _KeyContainer, + _MappedContainer>; + +template + requires(!__is_allocator<_Compare>::value && !__is_allocator<_KeyContainer>::value && + !__is_allocator<_MappedContainer>::value && uses_allocator_v<_KeyContainer, _Allocator> && + uses_allocator_v<_MappedContainer, _Allocator> && + is_invocable_v) +flat_multimap(sorted_equivalent_t, _KeyContainer, _MappedContainer, _Compare, _Allocator) + -> flat_multimap; + +template >> + requires(__has_input_iterator_category<_InputIterator>::value && !__is_allocator<_Compare>::value) +flat_multimap(_InputIterator, _InputIterator, _Compare = _Compare()) + -> flat_multimap<__iter_key_type<_InputIterator>, __iter_mapped_type<_InputIterator>, _Compare>; + +template >> + requires(__has_input_iterator_category<_InputIterator>::value && !__is_allocator<_Compare>::value) +flat_multimap(sorted_equivalent_t, _InputIterator, _InputIterator, _Compare = _Compare()) + -> flat_multimap<__iter_key_type<_InputIterator>, __iter_mapped_type<_InputIterator>, _Compare>; + +template >, + class _Allocator = allocator, + class = __enable_if_t::value && __is_allocator<_Allocator>::value>> +flat_multimap(from_range_t, _Range&&, 
_Compare = _Compare(), _Allocator = _Allocator()) -> flat_multimap< + __range_key_type<_Range>, + __range_mapped_type<_Range>, + _Compare, + vector<__range_key_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_key_type<_Range>>>, + vector<__range_mapped_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_mapped_type<_Range>>>>; + +template ::value>> +flat_multimap(from_range_t, _Range&&, _Allocator) -> flat_multimap< + __range_key_type<_Range>, + __range_mapped_type<_Range>, + less<__range_key_type<_Range>>, + vector<__range_key_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_key_type<_Range>>>, + vector<__range_mapped_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_mapped_type<_Range>>>>; + +template > + requires(!__is_allocator<_Compare>::value) +flat_multimap(initializer_list>, _Compare = _Compare()) -> flat_multimap<_Key, _Tp, _Compare>; + +template > + requires(!__is_allocator<_Compare>::value) +flat_multimap(sorted_equivalent_t, initializer_list>, _Compare = _Compare()) + -> flat_multimap<_Key, _Tp, _Compare>; + +template +struct uses_allocator, _Allocator> + : bool_constant && uses_allocator_v<_MappedContainer, _Allocator>> {}; + +template +_LIBCPP_HIDE_FROM_ABI typename flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>::size_type +erase_if(flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>& __flat_multimap, _Predicate __pred) { + auto __zv = ranges::views::zip(__flat_multimap.__containers_.keys, __flat_multimap.__containers_.values); + auto __first = __zv.begin(); + auto __last = __zv.end(); + auto __guard = std::__make_exception_guard([&] { __flat_multimap.clear(); }); + auto __it = std::remove_if(__first, __last, [&](auto&& __zipped) -> bool { + using _Ref = typename flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>::const_reference; + return __pred(_Ref(std::get<0>(__zipped), std::get<1>(__zipped))); + }); + auto __res = __last - __it; + auto __offset = __it - __first; + + const auto __erase_container = [&](auto& __cont) { __cont.erase(__cont.begin() + __offset, __cont.end()); }; + + __erase_container(__flat_multimap.__containers_.keys); + __erase_container(__flat_multimap.__containers_.values); + + __guard.__complete(); + return __res; +} + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP_STD_VER >= 23 + +_LIBCPP_POP_MACROS + +#endif // _LIBCPP___FLAT_MAP_FLAT_MULTIMAP_H diff --git a/libcxx/include/__flat_map/sorted_equivalent.h b/libcxx/include/__flat_map/sorted_equivalent.h new file mode 100644 index 0000000000000..1db935cc6ee75 --- /dev/null +++ b/libcxx/include/__flat_map/sorted_equivalent.h @@ -0,0 +1,31 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef _LIBCPP___FLAT_MAP_SORTED_EQUIVALENT_H
+#define _LIBCPP___FLAT_MAP_SORTED_EQUIVALENT_H
+
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if _LIBCPP_STD_VER >= 23
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+struct sorted_equivalent_t {
+  explicit sorted_equivalent_t() = default;
+};
+inline constexpr sorted_equivalent_t sorted_equivalent{};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 23
+
+#endif // _LIBCPP___FLAT_MAP_SORTED_EQUIVALENT_H
diff --git a/libcxx/include/__flat_map/utils.h b/libcxx/include/__flat_map/utils.h
new file mode 100644
index 0000000000000..acb7dca7ffe96
--- /dev/null
+++ b/libcxx/include/__flat_map/utils.h
@@ -0,0 +1,103 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___FLAT_MAP_UTILS_H
+#define _LIBCPP___FLAT_MAP_UTILS_H
+
+#include <__config>
+#include <__type_traits/container_traits.h>
+#include <__utility/exception_guard.h>
+#include <__utility/forward.h>
+#include <__utility/move.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+#if _LIBCPP_STD_VER >= 23
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// These utilities are defined in a class instead of a namespace so that this class can be befriended more easily.
+struct __flat_map_utils {
+  // Emplace a {key: value} pair into a flat_{multi}map, at the exact position that
+  // __it_key and __it_mapped point to, assuming that the key is not already present in the map.
+  // When an exception is thrown during the emplacement, the function tries its best to
+  // roll back the changes it made to the map. If it cannot roll back the changes, it
+  // clears the map.
+  template <class _Map, class _IterK, class _IterM, class _KeyArg, class... _MArgs>
+  _LIBCPP_HIDE_FROM_ABI static typename _Map::iterator __emplace_exact_pos(
+      _Map& __map, _IterK&& __it_key, _IterM&& __it_mapped, _KeyArg&& __key, _MArgs&&... __mapped_args) {
+    auto __on_key_failed = std::__make_exception_guard([&]() noexcept {
+      using _KeyContainer = typename _Map::key_container_type;
+      if constexpr (__container_traits<_KeyContainer>::__emplacement_has_strong_exception_safety_guarantee) {
+        // Nothing to roll back!
+      } else {
+        // We need to clear both because we don't know the state of our keys anymore.
+        __map.clear() /* noexcept */;
+      }
+    });
+    auto __key_it = __map.__containers_.keys.emplace(__it_key, std::forward<_KeyArg>(__key));
+    __on_key_failed.__complete();
+
+    auto __on_value_failed = std::__make_exception_guard([&]() noexcept {
+      using _MappedContainer = typename _Map::mapped_container_type;
+      if constexpr (!__container_traits<_MappedContainer>::__emplacement_has_strong_exception_safety_guarantee) {
+        // We need to clear both because we don't know the state of our values anymore.
+        __map.clear() /* noexcept */;
+      } else {
+        // In this case, we know the values are just like before we attempted emplacement,
+        // and we also know that the keys have been emplaced successfully. Just roll back the keys.
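+        // (Erasing the freshly inserted key below may itself throw; if that
+        // happens, the catch handler that follows falls back to clear() too.)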
+# if _LIBCPP_HAS_EXCEPTIONS
+        try {
+# endif // _LIBCPP_HAS_EXCEPTIONS
+          __map.__containers_.keys.erase(__key_it);
+# if _LIBCPP_HAS_EXCEPTIONS
+        } catch (...) {
+          // Now things are funky for real. We're failing to roll back the keys.
+          // Just give up and clear the whole thing.
+          //
+          // Also, swallow the exception that happened during the rollback and let the
+          // original value-emplacement exception propagate normally.
+          __map.clear() /* noexcept */;
+        }
+# endif // _LIBCPP_HAS_EXCEPTIONS
+      }
+    });
+    auto __mapped_it = __map.__containers_.values.emplace(__it_mapped, std::forward<_MArgs>(__mapped_args)...);
+    __on_value_failed.__complete();
+
+    return typename _Map::iterator(std::move(__key_it), std::move(__mapped_it));
+  }
+
+  // TODO: We could optimize this, see
+  // https://github.com/llvm/llvm-project/issues/108624
+  template <class _Map, class _InputIterator, class _Sentinel>
+  _LIBCPP_HIDE_FROM_ABI static typename _Map::size_type
+  __append(_Map& __map, _InputIterator __first, _Sentinel __last) {
+    typename _Map::size_type __num_appended = 0;
+    for (; __first != __last; ++__first) {
+      typename _Map::value_type __kv = *__first;
+      __map.__containers_.keys.insert(__map.__containers_.keys.end(), std::move(__kv.first));
+      __map.__containers_.values.insert(__map.__containers_.values.end(), std::move(__kv.second));
+      ++__num_appended;
+    }
+    return __num_appended;
+  }
+};
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 23
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___FLAT_MAP_UTILS_H
diff --git a/libcxx/include/__functional/is_transparent.h b/libcxx/include/__functional/is_transparent.h
index b2d62f2e3ead8..567df1a662f54 100644
--- a/libcxx/include/__functional/is_transparent.h
+++ b/libcxx/include/__functional/is_transparent.h
@@ -21,11 +21,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD

 #if _LIBCPP_STD_VER >= 14

-template <class _Tp, class _Up, class = void>
+template <class _Tp, class _Key, class = void>
 inline const bool __is_transparent_v = false;

-template <class _Tp, class _Up>
-inline const bool __is_transparent_v<_Tp, _Up, __void_t<typename _Tp::is_transparent> > = true;
+template <class _Tp, class _Key>
+inline const bool __is_transparent_v<_Tp, _Key, __void_t<typename _Tp::is_transparent> > = true;

 #endif
diff --git a/libcxx/include/flat_map b/libcxx/include/flat_map
index dbe5d8ee8f8c3..2552450081734 100644
--- a/libcxx/include/flat_map
+++ b/libcxx/include/flat_map
@@ -35,6 +35,25 @@ namespace std {
            class Predicate>
     typename flat_map<Key, T, Compare, KeyContainer, MappedContainer>::size_type
     erase_if(flat_map<Key, T, Compare, KeyContainer, MappedContainer>& c, Predicate pred);
+
+  // [flat.multimap], class template flat_multimap
+  template<class Key, class T, class Compare = less<Key>,
+           class KeyContainer = vector<Key>, class MappedContainer = vector<T>>
+  class flat_multimap;
+
+  struct sorted_equivalent_t { explicit sorted_equivalent_t() = default; };
+  inline constexpr sorted_equivalent_t sorted_equivalent{};
+
+  template<class Key, class T, class Compare, class KeyContainer, class MappedContainer,
+           class Allocator>
+  struct uses_allocator<flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>,
+                        Allocator>;
+
+  // [flat.multimap.erasure], erasure for flat_multimap
+  template<class Key, class T, class Compare, class KeyContainer, class MappedContainer,
+           class Predicate>
+  typename flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>::size_type
+  erase_if(flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>& c, Predicate pred);
 */

 #if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
@@ -44,6 +63,8 @@ namespace std {

 # if _LIBCPP_STD_VER >= 23
 #  include <__flat_map/flat_map.h>
+#  include <__flat_map/flat_multimap.h>
+#  include <__flat_map/sorted_equivalent.h>
 #  include <__flat_map/sorted_unique.h>
 # endif

diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 6c2fb8dc3940b..4bae02137b37b 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1244,9 +1244,20 @@ module std [system] {
   }

   module flat_map {
-    module flat_map { header "__flat_map/flat_map.h" }
+    module flat_map {
+      header "__flat_map/flat_map.h"
+      export std.vector.vector
+      export std.vector.fwd
+    }
+    module flat_multimap {
+
header "__flat_map/flat_multimap.h" + export std.vector.vector + export std.vector.fwd + } module key_value_iterator { header "__flat_map/key_value_iterator.h" } + module sorted_equivalent { header "__flat_map/sorted_equivalent.h" } module sorted_unique { header "__flat_map/sorted_unique.h" } + module utils { header "__flat_map/utils.h" } header "flat_map" export * diff --git a/libcxx/include/version b/libcxx/include/version index 57d6ec629d27c..29a71ed574e56 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -101,6 +101,8 @@ __cpp_lib_execution 201902L 201603L // C++17 __cpp_lib_expected 202211L __cpp_lib_filesystem 201703L +__cpp_lib_flat_map 202207L +__cpp_lib_flat_set 202207L __cpp_lib_format 202110L __cpp_lib_format_path 202403L __cpp_lib_format_ranges 202207L @@ -480,6 +482,8 @@ __cpp_lib_void_t 201411L # define __cpp_lib_constexpr_typeinfo 202106L # define __cpp_lib_containers_ranges 202202L # define __cpp_lib_expected 202211L +# define __cpp_lib_flat_map 202207L +// # define __cpp_lib_flat_set 202207L # define __cpp_lib_format_ranges 202207L // # define __cpp_lib_formatters 202302L # define __cpp_lib_forward_like 202207L diff --git a/libcxx/modules/std/flat_map.inc b/libcxx/modules/std/flat_map.inc index 6a86229bceaba..e9521749dc4a8 100644 --- a/libcxx/modules/std/flat_map.inc +++ b/libcxx/modules/std/flat_map.inc @@ -20,8 +20,6 @@ export namespace std { // [flat.map.erasure], erasure for flat_map using std::erase_if; -#endif // _LIBCPP_STD_VER >= 23 -#if 0 // [flat.multimap], class template flat_multimap using std::flat_multimap; @@ -29,5 +27,5 @@ export namespace std { using std::sorted_equivalent_t; // [flat.multimap.erasure], erasure for flat_multimap -#endif +#endif // _LIBCPP_STD_VER >= 23 } // namespace std diff --git a/libcxx/test/libcxx/containers/containers.adaptors/flat.map/assert.input_range.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.map/assert.input_range.pass.cpp similarity index 100% rename from libcxx/test/libcxx/containers/containers.adaptors/flat.map/assert.input_range.pass.cpp rename to libcxx/test/libcxx/containers/container.adaptors/flat.map/assert.input_range.pass.cpp diff --git a/libcxx/test/libcxx/containers/containers.adaptors/flat.map/assert.sorted_unique.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.map/assert.sorted_unique.pass.cpp similarity index 100% rename from libcxx/test/libcxx/containers/containers.adaptors/flat.map/assert.sorted_unique.pass.cpp rename to libcxx/test/libcxx/containers/container.adaptors/flat.map/assert.sorted_unique.pass.cpp diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.input_range.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.input_range.pass.cpp new file mode 100644 index 0000000000000..504f36fcd00b8 --- /dev/null +++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.input_range.pass.cpp @@ -0,0 +1,66 @@ +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: libcpp-hardening-mode=none
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// <flat_map>
+
+// flat_multimap(key_container_type , mapped_container_type , const key_compare& __comp = key_compare())
+// flat_multimap(const key_container_type& , const mapped_container_type& , const _Allocator& )
+// flat_multimap(const key_container_type& , const mapped_container_type& , const key_compare&, const _Allocator& )
+// void replace(key_container_type&& , mapped_container_type&&)
+//
+
+#include <flat_map>
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "check_assertion.h"
+
+int main(int, char**) {
+  using M = std::flat_multimap<int, int>;
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] { M m({1, 2, 3}, {4}); }()), "flat_multimap keys and mapped containers have different size");
+
+  TEST_LIBCPP_ASSERT_FAILURE(([] { M m({1, 2, 3}, {4}, std::less<int>{}); }()),
+                             "flat_multimap keys and mapped containers have different size");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        const std::vector<int> keys{1, 2, 3};
+        const std::vector<int> values{4};
+        const std::allocator<int> alloc{};
+        M m(keys, values, alloc);
+      }()),
+      "flat_multimap keys and mapped containers have different size");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        const std::vector<int> keys{1, 2, 3};
+        const std::vector<int> values{4};
+        const std::less<int> key_compare{};
+        const std::allocator<int> alloc{};
+        M m(keys, values, key_compare, alloc);
+      }()),
+      "flat_multimap keys and mapped containers have different size");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        std::vector<int> keys{1, 2, 3};
+        std::vector<int> values{4};
+        M m;
+        m.replace(std::move(keys), std::move(values));
+      }()),
+      "flat_multimap keys and mapped containers have different size");
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.sorted_equivalent.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.sorted_equivalent.pass.cpp
new file mode 100644
index 0000000000000..6b8ad3c7ac9aa
--- /dev/null
+++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.sorted_equivalent.pass.cpp
@@ -0,0 +1,225 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: has-unix-headers +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: libcpp-hardening-mode=none +// REQUIRES: libcpp-hardening-mode=debug +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing + +// + +// flat_multimap(key_container_type , mapped_container_type , const key_compare& __comp = key_compare()) +// flat_multimap(const key_container_type& , const mapped_container_type& , const _Allocator& ) +// flat_multimap(const key_container_type& , const mapped_container_type& , const key_compare&, const _Allocator& ) +// void replace(key_container_type&& , mapped_container_type&&) +// + +#include +#include +#include +#include +#include +#include + +#include "check_assertion.h" + +int main(int, char**) { + using M = std::flat_multimap; + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { M m(std::sorted_equivalent, {2, 2, 1}, {4, 5, 6}); }()), "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { M m(std::sorted_equivalent, {4, 2, 3}, {4, 5, 6}); }()), "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { M m(std::sorted_equivalent, {2, 2, 1}, {4, 5, 6}, std::less{}); }()), "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { M m(std::sorted_equivalent, {4, 2, 3}, {4, 5, 6}, std::less{}); }()), "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector keys{2, 2, 1}; + const std::vector values{4, 5, 6}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, keys, values, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector keys{4, 2, 3}; + const std::vector values{4, 5, 6}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, keys, values, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector keys{2, 2, 1}; + const std::vector values{4, 5, 6}; + const std::allocator alloc{}; + const std::less comp{}; + M m(std::sorted_equivalent, keys, values, comp, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector keys{4, 2, 3}; + const std::vector values{4, 5, 6}; + const std::allocator alloc{}; + const std::less comp{}; + M m(std::sorted_equivalent, keys, values, comp, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{2, 4}, {2, 5}, {1, 6}}; + const std::less comp{}; + M m(std::sorted_equivalent, v.begin(), v.end(), comp); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{4, 4}, {2, 5}, {3, 6}}; + const std::less comp{}; + M m(std::sorted_equivalent, v.begin(), v.end(), comp); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{2, 4}, {2, 5}, {1, 6}}; + const std::less comp{}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v.begin(), v.end(), comp, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{4, 4}, {2, 5}, {3, 6}}; + const std::less comp{}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v.begin(), v.end(), comp, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{2, 4}, {2, 5}, {1, 6}}; + const std::allocator alloc{}; + M 
m(std::sorted_equivalent, v.begin(), v.end(), alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{4, 4}, {2, 5}, {3, 6}}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v.begin(), v.end(), alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{2, 4}, {2, 5}, {1, 6}}; + const std::less comp{}; + M m(std::sorted_equivalent, v, comp); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{4, 4}, {2, 5}, {3, 6}}; + const std::less comp{}; + M m(std::sorted_equivalent, v, comp); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{2, 4}, {2, 5}, {1, 6}}; + const std::less comp{}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v, comp, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{4, 4}, {2, 5}, {3, 6}}; + const std::less comp{}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v, comp, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{2, 4}, {2, 5}, {1, 6}}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{4, 4}, {2, 5}, {3, 6}}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{2, 4}, {2, 5}, {1, 6}}; + M m; + m.insert(std::sorted_equivalent, v.begin(), v.end()); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{4, 4}, {2, 5}, {3, 6}}; + M m; + m.insert(std::sorted_equivalent, v.begin(), v.end()); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{2, 4}, {2, 5}, {1, 6}}; + M m; + m.insert(std::sorted_equivalent, v); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{4, 4}, {2, 5}, {3, 6}}; + M m; + m.insert(std::sorted_equivalent, v); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::vector keys{2, 1, 3}; + std::vector values{4, 5, 6}; + M m; + m.replace(std::move(keys), std::move(values)); + }()), + "Key container is not sorted"); + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.map.syn/sorted_equivalent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map.syn/sorted_equivalent.pass.cpp new file mode 100644 index 0000000000000..d9ee3fbd287b5 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.map.syn/sorted_equivalent.pass.cpp @@ -0,0 +1,50 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// struct sorted_equivalent_t { explicit sorted_equivalent_t() = default; };
+// inline constexpr sorted_equivalent_t sorted_equivalent{};
+
+#include <concepts>
+#include <flat_map>
+#include <type_traits>
+#include <utility>
+
+template <class T>
+void implicit_test(T) {}
+
+template <class T>
+concept HasImplicitDefaultCtor = requires { implicit_test<T>({}); };
+
+static_assert(std::is_default_constructible_v<std::sorted_equivalent_t>);
+static_assert(std::is_trivially_default_constructible_v<std::sorted_equivalent_t>);
+static_assert(!HasImplicitDefaultCtor<std::sorted_equivalent_t>);
+
+constexpr bool test() {
+  {
+    [[maybe_unused]] std::sorted_equivalent_t s;
+  }
+  {
+    [[maybe_unused]] std::same_as<const std::sorted_equivalent_t&> decltype(auto) s = (std::sorted_equivalent);
+  }
+  {
+    [[maybe_unused]] std::same_as<std::sorted_equivalent_t> decltype(auto) copy = std::sorted_equivalent;
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.pass.cpp
index 5ecc2cf7c917b..05efe063c1e17 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.pass.cpp
@@ -25,7 +25,9 @@ template <class KeyContainer, class ValueContainer>
 void test() {
-  using M = std::flat_map<int, char, std::less<int>, KeyContainer, ValueContainer>;
+  using Key = typename KeyContainer::value_type;
+  using Value = typename ValueContainer::value_type;
+  using M = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
   M m;
   ASSERT_SAME_TYPE(decltype(m.empty()), bool);
   ASSERT_NOEXCEPT(m.empty());
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.verify.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.verify.cpp
index cc8016182dcb6..79b943b790d04 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.verify.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.verify.cpp
@@ -14,11 +14,7 @@

 #include <flat_map>

-#include "test_macros.h"
-
-int main(int, char**) {
+void f() {
   std::flat_map<int, int> c;
   c.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-
-  return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.compile.pass.cpp
new file mode 100644
index 0000000000000..190d78f927f34
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.compile.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// Test CTAD on cases where deduction should fail.
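+//
+// A deduction that is supposed to fail cannot be written out directly (it
+// would be a hard error), so the checks below wrap CTAD in a variadic
+// concept: inside a requires-expression, the ill-formed deduction merely
+// renders the concept false. A minimal sketch of the technique (hypothetical
+// name, not the helper used below):
+//
+//   template <class... Args>
+//   concept CanDeduct = requires { std::flat_map{std::declval<Args>()...}; };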
+ +#include +#include +#include +#include +#include + +struct NotAnAllocator { + friend bool operator<(NotAnAllocator, NotAnAllocator) { return false; } +}; + +using P = std::pair; +using PC = std::pair; + +template +concept CanDeductFlatMap = requires { std::flat_map{std::declval()...}; }; + +static_assert(CanDeductFlatMap, std::vector>); + +// cannot deduce Key and T from nothing +static_assert(!CanDeductFlatMap<>); + +// cannot deduce Key and T from just (KeyContainer), even if it's a container of pairs +static_assert(!CanDeductFlatMap>>); + +// cannot deduce Key and T from just (KeyContainer, Allocator) +static_assert(!CanDeductFlatMap, std::allocator>>); + +// cannot deduce Key and T from just (Compare) +static_assert(!CanDeductFlatMap>); + +// cannot deduce Key and T from just (Compare, Allocator) +static_assert(!CanDeductFlatMap, std::allocator>); + +// cannot deduce Key and T from just (Allocator) +static_assert(!CanDeductFlatMap>); + +// cannot convert from some arbitrary unrelated type +static_assert(!CanDeductFlatMap); diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.pass.cpp index d01bee9aae9c0..009392feb3862 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.pass.cpp @@ -169,6 +169,24 @@ void test_iter_iter() { std::flat_map m(mo.cbegin(), mo.cend()); ASSERT_SAME_TYPE(decltype(m), decltype(mo)); } + { + std::pair source[3] = {{1, 1}, {2, 2}, {3, 3}}; + std::flat_map s = {source, source + 3}; // flat_map(InputIterator, InputIterator) + ASSERT_SAME_TYPE(decltype(s), std::flat_map); + assert(s.size() == 3); + } + { + std::pair source[3] = {{1, 1}, {2, 2}, {3, 3}}; + std::flat_map s{source, source + 3}; // flat_map(InputIterator, InputIterator) + ASSERT_SAME_TYPE(decltype(s), std::flat_map); + assert(s.size() == 3); + } + { + std::pair source[3] = {{1, 1}, {2, 2}, {3, 3}}; + std::flat_map s{std::sorted_unique, source, source + 3}; // flat_map(sorted_unique_t, InputIterator, InputIterator) + static_assert(std::is_same_v>); + assert(s.size() == 3); + } } void test_iter_iter_compare() { @@ -227,6 +245,19 @@ void test_initializer_list() { ASSERT_SAME_TYPE(decltype(m), std::flat_map); assert(std::ranges::equal(m, sorted_arr)); } + { + std::flat_map s = {std::make_pair(1, 'a')}; // flat_map(initializer_list>) + ASSERT_SAME_TYPE(decltype(s), std::flat_map); + assert(s.size() == 1); + } + { + using M = std::flat_map; + M m; + std::flat_map s = {std::make_pair(m, m)}; // flat_map(initializer_list>) + ASSERT_SAME_TYPE(decltype(s), std::flat_map); + assert(s.size() == 1); + assert(s[m] == m); + } } void test_initializer_list_compare() { @@ -305,38 +336,6 @@ int main(int, char**) { test_from_range_compare(); AssociativeContainerDeductionGuidesSfinaeAway>(); - { - std::flat_map s = {std::make_pair(1, 'a')}; // flat_map(initializer_list>) - ASSERT_SAME_TYPE(decltype(s), std::flat_map); - assert(s.size() == 1); - } - { - using M = std::flat_map; - M m; - std::flat_map s = {std::make_pair(m, m)}; // flat_map(initializer_list>) - ASSERT_SAME_TYPE(decltype(s), std::flat_map); - assert(s.size() == 1); - assert(s[m] == m); - } - - { - std::pair source[3] = {{1, 1}, {2, 2}, {3, 3}}; - std::flat_map s = {source, source + 3}; // flat_map(InputIterator, InputIterator) - ASSERT_SAME_TYPE(decltype(s), std::flat_map); - assert(s.size() == 3); - } - { - 
std::pair source[3] = {{1, 1}, {2, 2}, {3, 3}}; - std::flat_map s{source, source + 3}; // flat_map(InputIterator, InputIterator) - ASSERT_SAME_TYPE(decltype(s), std::flat_map); - assert(s.size() == 3); - } - { - std::pair source[3] = {{1, 1}, {2, 2}, {3, 3}}; - std::flat_map s{std::sorted_unique, source, source + 3}; // flat_map(sorted_unique_t, InputIterator, InputIterator) - static_assert(std::is_same_v>); - assert(s.size() == 3); - } return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.verify.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.verify.cpp index 08244f01cb24e..ed20c1ae715b8 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.verify.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.verify.cpp @@ -14,56 +14,12 @@ #include #include -#include #include -#include - -struct NotAnAllocator { - friend bool operator<(NotAnAllocator, NotAnAllocator) { return false; } -}; using P = std::pair; using PC = std::pair; void test() { - { - // cannot deduce Key and T from just (KeyContainer), even if it's a container of pairs - std::vector> v; - std::flat_map s(v); - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } - { - // cannot deduce Key and T from just (KeyContainer, Allocator) - std::vector v; - std::flat_map s(v, std::allocator>()); - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } - { - // cannot deduce Key and T from nothing - std::flat_map m; - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } - { - // cannot deduce Key and T from just (Compare) - std::flat_map m(std::less{}); - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } - { - // cannot deduce Key and T from just (Compare, Allocator) - std::flat_map m(std::less{}, std::allocator{}); - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } - { - // cannot deduce Key and T from just (Allocator) - std::flat_map m(std::allocator{}); - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } - { - // cannot convert from some arbitrary unrelated type - NotAnAllocator a; - std::flat_map m(a); - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } { // cannot deduce that the inner braced things should be std::pair and not something else std::flat_map m{{1, 1L}, {2, 2L}, {3, 3L}}; diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default_noexcept.pass.cpp index ac24c8a8ac067..790dfa4a02ed5 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default_noexcept.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default_noexcept.pass.cpp @@ -37,10 +37,12 @@ int main(int, char**) { { using C = std::flat_map; static_assert(std::is_nothrow_default_constructible_v); + C c; } { using C = std::flat_map, std::vector>>; static_assert(std::is_nothrow_default_constructible_v); + C c; } 
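+  // Instantiating C above (rather than only inspecting the trait) ensures the
+  // nothrow default constructor is actually compiled and executed.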
#endif // _LIBCPP_VERSION { diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp index e3ab33a55d95b..1570b0fa14888 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp @@ -24,28 +24,32 @@ struct ThrowingDtorComp { bool operator()(const auto&, const auto&) const; - ~ThrowingDtorComp() noexcept(false); + ~ThrowingDtorComp() noexcept(false) {} }; int main(int, char**) { { using C = std::flat_map; static_assert(std::is_nothrow_destructible_v); + C c; } { using V = std::vector>; using C = std::flat_map, V, V>; static_assert(std::is_nothrow_destructible_v); + C c; } { using V = std::deque>; using C = std::flat_map, V, V>; static_assert(std::is_nothrow_destructible_v); + C c; } #if defined(_LIBCPP_VERSION) { using C = std::flat_map; static_assert(!std::is_nothrow_destructible_v); + C c; } #endif // _LIBCPP_VERSION diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.pass.cpp new file mode 100644 index 0000000000000..4fa4fd6a69b94 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.pass.cpp @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// [[nodiscard]] bool empty() const noexcept; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m; + ASSERT_SAME_TYPE(decltype(m.empty()), bool); + ASSERT_NOEXCEPT(m.empty()); + assert(m.empty()); + assert(std::as_const(m).empty()); + m = {{1, 1.0}, {1, 2.0}}; + assert(!m.empty()); + m.clear(); + assert(m.empty()); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.verify.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.verify.cpp new file mode 100644 index 0000000000000..9b7b827c9bec8 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.verify.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// [[nodiscard]] bool empty() const noexcept; + +#include + +void f() { + std::flat_multimap c; + c.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/max_size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/max_size.pass.cpp new file mode 100644 index 0000000000000..0960c43c5a90a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/max_size.pass.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// size_type max_size() const noexcept; + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_allocator.h" +#include "test_macros.h" + +int main(int, char**) { + { + using A1 = limited_allocator; + using A2 = limited_allocator; + using C = std::flat_multimap, std::vector, std::vector>; + ASSERT_SAME_TYPE(C::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(C::size_type, std::size_t); + const C c; + ASSERT_NOEXCEPT(c.max_size()); + ASSERT_SAME_TYPE(decltype(c.max_size()), C::size_type); + assert(c.max_size() <= 10); + LIBCPP_ASSERT(c.max_size() == 10); + } + { + using A1 = limited_allocator; + using A2 = limited_allocator; + using C = std::flat_multimap, std::vector, std::vector>; + ASSERT_SAME_TYPE(C::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(C::size_type, std::size_t); + const C c; + ASSERT_NOEXCEPT(c.max_size()); + ASSERT_SAME_TYPE(decltype(c.max_size()), C::size_type); + assert(c.max_size() <= 10); + LIBCPP_ASSERT(c.max_size() == 10); + } + { + using A = limited_allocator; + using C = std::flat_multimap, std::vector, std::vector>; + ASSERT_SAME_TYPE(C::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(C::size_type, std::size_t); + const C::size_type max_dist = static_cast(std::numeric_limits::max()); + const C c; + ASSERT_NOEXCEPT(c.max_size()); + ASSERT_SAME_TYPE(decltype(c.max_size()), C::size_type); + assert(c.max_size() <= max_dist); + LIBCPP_ASSERT(c.max_size() == max_dist); + } + { + typedef std::flat_multimap C; + ASSERT_SAME_TYPE(C::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(C::size_type, std::size_t); + const C::size_type max_dist = static_cast(std::numeric_limits::max()); + const C c; + ASSERT_NOEXCEPT(c.max_size()); + ASSERT_SAME_TYPE(decltype(c.max_size()), C::size_type); + assert(c.max_size() <= max_dist); + assert(c.max_size() <= alloc_max_size(std::allocator())); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/size.pass.cpp new file mode 100644 index 0000000000000..533f8da631fc8 --- /dev/null +++ 
b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/size.pass.cpp @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// size_type size() const noexcept; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + { + const M m = {{1, 'a'}, {1, 'b'}, {4, 'd'}, {5, 'e'}, {5, 'h'}}; + ASSERT_SAME_TYPE(decltype(m.size()), std::size_t); + ASSERT_NOEXCEPT(m.size()); + assert(m.size() == 5); + } + { + const M m = {{1, 'a'}}; + ASSERT_SAME_TYPE(decltype(m.size()), std::size_t); + ASSERT_NOEXCEPT(m.size()); + assert(m.size() == 1); + } + { + const M m; + ASSERT_SAME_TYPE(decltype(m.size()), std::size_t); + ASSERT_NOEXCEPT(m.size()); + assert(m.size() == 0); + } + { + M m; + std::size_t s = 1000; + for (auto i = 0u; i < s; ++i) { + m.emplace(i, 'a'); + } + for (auto i = 0u; i < s; ++i) { + m.emplace(i, 'b'); + } + ASSERT_SAME_TYPE(decltype(m.size()), std::size_t); + ASSERT_NOEXCEPT(m.size()); + assert(m.size() == 2 * s); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/alloc.pass.cpp new file mode 100644 index 0000000000000..3e155eb2a1075 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/alloc.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// explicit flat_multimap(const Allocator& a); + +#include +#include +#include +#include + +#include "test_macros.h" +#include "test_allocator.h" +#include "../../../test_compare.h" + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. 
+ + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multimap; + using M2 = std::flat_multimap; + using M3 = std::flat_multimap; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + // explicit + using M = + std::flat_multimap, + std::vector>, + std::vector>>; + + static_assert(std::is_constructible_v>); + static_assert(!std::is_convertible_v, M>); + } + { + using A = test_allocator; + using M = + std::flat_multimap, + std::vector>, + std::vector>>; + M m(A(0, 5)); + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.keys().get_allocator().get_id() == 5); + assert(m.values().get_allocator().get_id() == 5); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/assign_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/assign_initializer_list.pass.cpp new file mode 100644 index 0000000000000..32f75daae7e38 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/assign_initializer_list.pass.cpp @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap& operator=(initializer_list il); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" +#include "test_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + { + M m = {{8, 8}, {10, 10}}; + assert(m.size() == 2); + m = {{3, 0}, {1, 0}, {2, 0}, {2, 1}, {3, 1}, {4, 0}, {3, 2}, {5, 0}, {6, 0}, {5, 1}}; + std::pair expected[] = {{1, 0}, {2, 0}, {2, 1}, {3, 0}, {3, 1}, {3, 2}, {4, 0}, {5, 0}, {5, 1}, {6, 0}}; + assert(std::ranges::equal(m, expected)); + } + { + M m = {{10, 1}, {8, 1}}; + assert(m.size() == 2); + m = {{3, 2}}; + std::pair expected[] = {{3, 2}}; + assert(std::ranges::equal(m, expected)); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/compare.pass.cpp new file mode 100644 index 0000000000000..1989b8a4ff68a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/compare.pass.cpp @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// explicit flat_multimap(const key_compare& comp); +// template +// flat_multimap(const key_compare& comp, const Alloc& a); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using M1 = std::flat_multimap, std::vector>; + using M2 = std::flat_multimap, std::vector>; + using M3 = std::flat_multimap, std::vector>; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + using C = test_less; + auto m = std::flat_multimap(C(3)); + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp() == C(3)); + } + { + // The one-argument ctor is explicit. + using C = test_less; + static_assert(std::is_constructible_v, C>); + static_assert(!std::is_convertible_v>); + + static_assert(std::is_constructible_v, std::less>); + static_assert(!std::is_convertible_v, std::flat_multimap>); + } + { + using C = test_less; + using A1 = test_allocator; + using A2 = test_allocator; + auto m = std::flat_multimap, std::vector>(C(4), A1(5)); + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp() == C(4)); + assert(m.keys().get_allocator() == A1(5)); + assert(m.values().get_allocator() == A2(5)); + } + { + // explicit(false) + using C = test_less; + using A1 = test_allocator; + using A2 = test_allocator; + std::flat_multimap, std::deque> m = {C(4), A1(5)}; + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp() == C(4)); + assert(m.keys().get_allocator() == A1(5)); + assert(m.values().get_allocator() == A2(5)); + } + { + // If an allocator is given, it must be usable by both containers. + using A = test_allocator; + using M = std::flat_multimap, std::vector, std::vector>; + static_assert(std::is_constructible_v>); + static_assert(!std::is_constructible_v, std::allocator>); + static_assert(!std::is_constructible_v, A>); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/containers.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/containers.pass.cpp new file mode 100644 index 0000000000000..17ee3c3864b1b --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/containers.pass.cpp @@ -0,0 +1,187 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(key_container_type key_cont, mapped_container_type mapped_cont, +// const key_compare& comp = key_compare()); +// template +// flat_multimap(const key_container_type& key_cont, const mapped_container_type& mapped_cont, +// const Allocator& a); +// template +// flat_multimap(const key_container_type& key_cont, const mapped_container_type& mapped_cont, +// const key_compare& comp, const Alloc& a); + +#include +#include +#include +#include +#include + +#include "min_allocator.h" +#include "MoveOnly.h" +#include "test_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" +#include "../../../test_compare.h" + +struct P { + int first; + int second; + template + bool operator==(const std::pair& rhs) const { + return MoveOnly(first) == rhs.first && MoveOnly(second) == rhs.second; + } +}; + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multimap; + using M2 = std::flat_multimap; + using M3 = std::flat_multimap; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + // flat_multimap(key_container_type , mapped_container_type) + using M = std::flat_multimap; + std::vector ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + std::vector vs = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto m = M(ks, vs); + std::pair expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}}; + assert(std::ranges::equal(m, expected)); + + // explicit(false) + M m2 = {ks, vs}; + assert(m2 == m); + + m = M(std::move(ks), std::move(vs)); + assert(ks.empty()); // it was moved-from + assert(vs.empty()); // it was moved-from + assert(std::ranges::equal(m, expected)); + } + { + // flat_multimap(key_container_type , mapped_container_type) + // move-only + P expected[] = {{3, 3}, {3, 2}, {2, 1}, {1, 4}}; + using Ks = std::deque>; + using Vs = std::vector>; + using M = std::flat_multimap, Ks, Vs>; + Ks ks = {1, 3, 3, 2}; + Vs vs; + vs.push_back(4); + vs.push_back(3); + vs.push_back(2); + vs.push_back(1); + auto m = M(std::move(ks), std::move(vs)); + assert(ks.empty()); // it was moved-from + assert(vs.empty()); // it was moved-from + assert(std::ranges::equal(m, expected, std::equal_to<>())); + } + { + // flat_multimap(key_container_type , mapped_container_type) + // container's allocators are used + using A = test_allocator; + using M = std::flat_multimap, std::vector, std::deque>; + auto ks = std::vector({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); + auto vs = std::deque({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(6)); + auto m = M(std::move(ks), std::move(vs)); + assert(ks.empty()); // it was moved-from + assert(vs.empty()); // it was moved-from + std::pair expected[] = {{1, 1}, {1, 1}, {1, 1}, {2, 2}, {2, 2}, {2, 2}, {3, 3}, {3, 3}, {3, 3}}; + assert(std::ranges::equal(m, expected)); + assert(m.keys().get_allocator() == A(5)); + 
assert(m.values().get_allocator() == A(6)); + } + { + // flat_multimap(key_container_type , mapped_container_type, key_compare) + using C = test_less; + using M = std::flat_multimap; + std::vector ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + std::vector vs = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto m = M(ks, vs, C(4)); + std::pair expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}}; + assert(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(4)); + + // explicit(false) + M m2 = {ks, vs, C(4)}; + assert(m2 == m); + assert(m2.key_comp() == C(4)); + } + { + // flat_multimap(key_container_type , mapped_container_type, const Allocator&) + using A = test_allocator; + using M = std::flat_multimap, std::vector, std::deque>; + auto ks = std::vector({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); + auto vs = std::deque({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(6)); + auto m = M(ks, vs, A(4)); // replaces the allocators + assert(!ks.empty()); // it was an lvalue above + assert(!vs.empty()); // it was an lvalue above + std::pair expected[] = {{1, 1}, {1, 1}, {1, 1}, {2, 2}, {2, 2}, {2, 2}, {3, 3}, {3, 3}, {3, 3}}; + assert(std::ranges::equal(m, expected)); + assert(m.keys().get_allocator() == A(4)); + assert(m.values().get_allocator() == A(4)); + } + { + // flat_multimap(key_container_type , mapped_container_type, const Allocator&) + // explicit(false) + using A = test_allocator; + using M = std::flat_multimap, std::vector, std::deque>; + auto ks = std::vector({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); + auto vs = std::deque({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(6)); + M m = {ks, vs, A(4)}; // implicit ctor + assert(!ks.empty()); // it was an lvalue above + assert(!vs.empty()); // it was an lvalue above + std::pair expected[] = {{1, 1}, {1, 1}, {1, 1}, {2, 2}, {2, 2}, {2, 2}, {3, 3}, {3, 3}, {3, 3}}; + assert(std::ranges::equal(m, expected)); + assert(m.keys().get_allocator() == A(4)); + assert(m.values().get_allocator() == A(4)); + } + { + // flat_multimap(key_container_type , mapped_container_type, key_compare, const Allocator&) + using C = test_less; + using A = test_allocator; + using M = std::flat_multimap, std::vector>; + std::vector ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + std::vector vs = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto m = M(ks, vs, C(4), A(5)); + std::pair expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}}; + assert(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(4)); + assert(m.keys().get_allocator() == A(5)); + assert(m.values().get_allocator() == A(5)); + + // explicit(false) + M m2 = {ks, vs, C(4), A(5)}; + assert(m2 == m); + assert(m2.key_comp() == C(4)); + assert(m2.keys().get_allocator() == A(5)); + assert(m2.values().get_allocator() == A(5)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy.pass.cpp new file mode 100644 index 0000000000000..0e6d12cd3c569 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy.pass.cpp @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(const flat_multimap& m); + +#include +#include +#include + +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +int main(int, char**) { + { + using C = test_less; + std::vector> ks({1, 1, 3, 3, 5}, test_allocator(6)); + std::vector> vs({2, 2, 1, 1, 1}, test_allocator(7)); + using M = std::flat_multimap; + auto mo = M(ks, vs, C(5)); + auto m = mo; + + assert(m.key_comp() == C(5)); + assert(m.keys() == ks); + assert(m.values() == vs); + assert(m.keys().get_allocator() == test_allocator(6)); + assert(m.values().get_allocator() == test_allocator(7)); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + assert(mo.keys() == ks); + assert(mo.values() == vs); + assert(mo.keys().get_allocator() == test_allocator(6)); + assert(mo.values().get_allocator() == test_allocator(7)); + } + { + using C = test_less; + using Ks = std::vector>; + using Vs = std::vector>; + auto ks = Ks({1, 3, 5, 5, 5, 5}, other_allocator(6)); + auto vs = Vs({2, 2, 5, 5, 5, 1}, other_allocator(7)); + using M = std::flat_multimap; + auto mo = M(Ks(ks, other_allocator(6)), Vs(vs, other_allocator(7)), C(5)); + auto m = mo; + + assert(m.key_comp() == C(5)); + assert(m.keys() == ks); + assert(m.values() == vs); + assert(m.keys().get_allocator() == other_allocator(-2)); + assert(m.values().get_allocator() == other_allocator(-2)); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + assert(mo.keys() == ks); + assert(mo.values() == vs); + assert(mo.keys().get_allocator() == other_allocator(6)); + assert(mo.values().get_allocator() == other_allocator(7)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_alloc.pass.cpp new file mode 100644 index 0000000000000..3047c004d42e9 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_alloc.pass.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(const flat_multimap&, const allocator_type&); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. 
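+    // In practice the allocator argument must be usable by *both* containers:
+    // M2 and M3 below each pair one container with an incompatible allocator,
+    // and the static_asserts verify those combinations are rejected.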
+ + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multimap; + using M2 = std::flat_multimap; + using M3 = std::flat_multimap; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + using C = test_less; + std::vector> ks({1, 3, 3, 5, 5}, test_allocator(6)); + std::vector> vs({2, 2, 1, 1, 1}, test_allocator(7)); + using M = std::flat_multimap; + auto mo = M(ks, vs, C(5)); + auto m = M(mo, test_allocator(3)); + + assert(m.key_comp() == C(5)); + assert(m.keys() == ks); + assert(m.values() == vs); + assert(m.keys().get_allocator() == test_allocator(3)); + assert(m.values().get_allocator() == test_allocator(3)); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + assert(mo.keys() == ks); + assert(mo.values() == vs); + assert(mo.keys().get_allocator() == test_allocator(6)); + assert(mo.values().get_allocator() == test_allocator(7)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.addressof.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.addressof.compile.pass.cpp new file mode 100644 index 0000000000000..233a9c6859318 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.addressof.compile.pass.cpp @@ -0,0 +1,30 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap& operator=(const flat_multimap& s); + +// Validate whether the container can be copy-assigned (move-assigned, swapped) +// with an ADL-hijacking operator& + +#include +#include + +#include "test_macros.h" +#include "operator_hijacker.h" + +void test() { + std::flat_multimap so; + std::flat_multimap s; + s = so; + s = std::move(so); + swap(s, so); +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.pass.cpp new file mode 100644 index 0000000000000..3dd7ebdd38871 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.pass.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// flat_multimap& operator=(const flat_multimap& m);
+
+#include <cassert>
+#include <flat_map>
+#include <functional>
+#include <vector>
+
+#include "test_macros.h"
+#include "../../../test_compare.h"
+#include "test_allocator.h"
+
+int main(int, char**) {
+  {
+    // test_allocator is not propagated
+    using C = test_less<int>;
+    std::vector<int, test_allocator<int>> ks({1, 1, 3, 3, 5}, test_allocator<int>(6));
+    std::vector<char, test_allocator<char>> vs({1, 2, 3, 4, 5}, test_allocator<char>(7));
+    using M = std::flat_multimap<int, char, C, decltype(ks), decltype(vs)>;
+    auto mo = M(ks, vs, C(5));
+    auto m  = M({{3, 3}, {4, 4}, {5, 5}}, C(3), test_allocator<int>(2));
+    m       = mo;
+
+    assert(m.key_comp() == C(5));
+    assert(m.keys() == ks);
+    assert(m.values() == vs);
+    assert(m.keys().get_allocator() == test_allocator<int>(2));
+    assert(m.values().get_allocator() == test_allocator<char>(2));
+
+    // mo is unchanged
+    assert(mo.key_comp() == C(5));
+    assert(mo.keys() == ks);
+    assert(mo.values() == vs);
+    assert(mo.keys().get_allocator() == test_allocator<int>(6));
+    assert(mo.values().get_allocator() == test_allocator<char>(7));
+  }
+  {
+    // other_allocator is propagated
+    using C  = test_less<int>;
+    using Ks = std::vector<int, other_allocator<int>>;
+    using Vs = std::vector<char, other_allocator<char>>;
+    auto ks  = Ks({1, 1, 3, 3, 5}, other_allocator<int>(6));
+    auto vs  = Vs({2, 1, 3, 2, 1}, other_allocator<char>(7));
+    using M  = std::flat_multimap<int, char, C, Ks, Vs>;
+    auto mo  = M(Ks(ks, other_allocator<int>(6)), Vs(vs, other_allocator<char>(7)), C(5));
+    auto m   = M({{3, 3}, {4, 4}, {5, 5}}, C(3), other_allocator<int>(2));
+    m        = mo;
+
+    assert(m.key_comp() == C(5));
+    assert(m.keys() == ks);
+    assert(m.values() == vs);
+    assert(m.keys().get_allocator() == other_allocator<int>(6));
+    assert(m.values().get_allocator() == other_allocator<char>(7));
+
+    // mo is unchanged
+    assert(mo.key_comp() == C(5));
+    assert(mo.keys() == ks);
+    assert(mo.values() == vs);
+    assert(mo.keys().get_allocator() == other_allocator<int>(6));
+    assert(mo.values().get_allocator() == other_allocator<char>(7));
+  }
+  {
+    // self-assignment
+    using M = std::flat_multimap<int, int>;
+    M m     = {{1, 1}, {3, 4}};
+    m       = static_cast<const M&>(m);
+    assert((m == M{{1, 1}, {3, 4}}));
+  }
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.compile.pass.cpp
new file mode 100644
index 0000000000000..a9d8382bd037c
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.compile.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// Test CTAD on cases where deduction should fail.
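+// Deduction needs arguments from which both Key and T can be inferred (e.g. a
+// range of pairs, or a key container plus a mapped container); every case
+// below omits that information, so the deduction guides must SFINAE away.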
+ +#include +#include +#include +#include +#include + +struct NotAnAllocator { + friend bool operator<(NotAnAllocator, NotAnAllocator) { return false; } +}; + +using P = std::pair; +using PC = std::pair; + +template +concept CanDeductFlatMultimap = requires { std::flat_multimap{std::declval()...}; }; + +static_assert(CanDeductFlatMultimap, std::vector>); + +// cannot deduce Key and T from nothing +static_assert(!CanDeductFlatMultimap<>); + +// cannot deduce Key and T from just (KeyContainer), even if it's a container of pairs +static_assert(!CanDeductFlatMultimap>>); + +// cannot deduce Key and T from just (KeyContainer, Allocator) +static_assert(!CanDeductFlatMultimap, std::allocator>>); + +// cannot deduce Key and T from just (Compare) +static_assert(!CanDeductFlatMultimap>); + +// cannot deduce Key and T from just (Compare, Allocator) +static_assert(!CanDeductFlatMultimap, std::allocator>); + +// cannot deduce Key and T from just (Allocator) +static_assert(!CanDeductFlatMultimap>); + +// cannot convert from some arbitrary unrelated type +static_assert(!CanDeductFlatMultimap); diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.pass.cpp new file mode 100644 index 0000000000000..a718d9cfad5b7 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.pass.cpp @@ -0,0 +1,343 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "deduction_guides_sfinae_checks.h" +#include "test_allocator.h" + +using P = std::pair; +using PC = std::pair; + +void test_copy() { + { + std::flat_multimap source = {{1, 2}, {1, 3}}; + std::flat_multimap s(source); + ASSERT_SAME_TYPE(decltype(s), decltype(source)); + assert(s == source); + } + { + std::flat_multimap> source = {{1, 2}, {1, 3}}; + std::flat_multimap s{source}; // braces instead of parens + ASSERT_SAME_TYPE(decltype(s), decltype(source)); + assert(s == source); + } + { + std::flat_multimap> source = {{1, 2}, {1, 3}}; + std::flat_multimap s(source, std::allocator()); + ASSERT_SAME_TYPE(decltype(s), decltype(source)); + assert(s == source); + } +} + +void test_containers() { + std::deque> ks({1, 2, 1, 2, 2, INT_MAX, 3}, test_allocator(0, 42)); + std::deque> vs({1, 2, 3, 4, 5, 3, 4}, test_allocator(0, 43)); + std::deque> sorted_ks({1, 1, 2, 2, 2, 3, INT_MAX}, test_allocator(0, 42)); + std::deque> sorted_vs({1, 3, 2, 4, 5, 4, 3}, test_allocator(0, 43)); + const std::pair expected[] = {{1, 1}, {1, 3}, {2, 2}, {2, 4}, {2, 5}, {3, 4}, {INT_MAX, 3}}; + { + std::flat_multimap s(ks, vs); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 42); + assert(s.values().get_allocator().get_id() == 43); + } + { + std::flat_multimap s(std::sorted_equivalent, sorted_ks, sorted_vs); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), 
decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 42); + assert(s.values().get_allocator().get_id() == 43); + } + { + std::flat_multimap s(ks, vs, test_allocator(0, 44)); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 44); + assert(s.values().get_allocator().get_id() == 44); + } + { + std::flat_multimap s(std::sorted_equivalent, sorted_ks, sorted_vs, test_allocator(0, 44)); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 44); + assert(s.values().get_allocator().get_id() == 44); + } +} + +void test_containers_compare() { + std::deque> ks({1, 2, 1, 2, 2, INT_MAX, 3}, test_allocator(0, 42)); + std::deque> vs({1, 2, 3, 4, 5, 3, 4}, test_allocator(0, 43)); + std::deque> sorted_ks({INT_MAX, 3, 2, 2, 2, 1, 1}, test_allocator(0, 42)); + std::deque> sorted_vs({3, 4, 2, 4, 5, 1, 3}, test_allocator(0, 43)); + const std::pair expected[] = {{INT_MAX, 3}, {3, 4}, {2, 2}, {2, 4}, {2, 5}, {1, 1}, {1, 3}}; + { + std::flat_multimap s(ks, vs, std::greater()); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 42); + assert(s.values().get_allocator().get_id() == 43); + } + { + std::flat_multimap s(std::sorted_equivalent, sorted_ks, sorted_vs, std::greater()); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 42); + assert(s.values().get_allocator().get_id() == 43); + } + { + std::flat_multimap s(ks, vs, std::greater(), test_allocator(0, 44)); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 44); + assert(s.values().get_allocator().get_id() == 44); + } + { + std::flat_multimap s( + std::sorted_equivalent, sorted_ks, sorted_vs, std::greater(), test_allocator(0, 44)); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 44); + assert(s.values().get_allocator().get_id() == 44); + } +} + +void test_iter_iter() { + const P arr[] = {{1, 1L}, {2, 2L}, {1, 1L}, {INT_MAX, 1L}, {3, 1L}}; + const P sorted_arr[] = {{1, 1L}, {1, 1L}, {2, 2L}, {3, 1L}, {INT_MAX, 1L}}; + const PC arrc[] = {{1, 1L}, {2, 2L}, {1, 1L}, {INT_MAX, 1L}, {3, 1L}}; + const PC sorted_arrc[] = {{1, 1L}, {1, 1L}, {2, 2L}, {3, 1L}, {INT_MAX, 1L}}; + { + std::flat_multimap m(std::begin(arr), std::end(arr)); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::begin(arrc), std::end(arrc)); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::sorted_equivalent, std::begin(sorted_arr), std::end(sorted_arr)); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::sorted_equivalent, std::begin(sorted_arrc), std::end(sorted_arrc)); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { 
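+    // An empty iterator range taken from an existing flat_multimap still
+    // deduces the source map's exact specialization, as verified below.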
+ std::flat_multimap mo; + std::flat_multimap m(mo.begin(), mo.end()); + ASSERT_SAME_TYPE(decltype(m), decltype(mo)); + } + { + std::flat_multimap mo; + std::flat_multimap m(mo.cbegin(), mo.cend()); + ASSERT_SAME_TYPE(decltype(m), decltype(mo)); + } + { + std::pair source[3] = {{1, 1}, {1, 1}, {3, 3}}; + std::flat_multimap s = {source, source + 3}; // flat_multimap(InputIterator, InputIterator) + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap); + assert(s.size() == 3); + } + { + std::pair source[3] = {{1, 1}, {1, 1}, {3, 3}}; + std::flat_multimap s{source, source + 3}; // flat_multimap(InputIterator, InputIterator) + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap); + assert(s.size() == 3); + } + { + std::pair source[3] = {{1, 1}, {1, 2}, {3, 3}}; + std::flat_multimap s{ + std::sorted_equivalent, source, source + 3}; // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator) + static_assert(std::is_same_v>); + assert(s.size() == 3); + } +} + +void test_iter_iter_compare() { + const P arr[] = {{1, 1L}, {2, 2L}, {1, 1L}, {INT_MAX, 1L}, {3, 1L}}; + const P sorted_arr[] = {{INT_MAX, 1L}, {3, 1L}, {2, 2L}, {1, 1L}, {1, 1L}}; + const PC arrc[] = {{1, 1L}, {2, 2L}, {1, 1L}, {INT_MAX, 1L}, {3, 1L}}; + const PC sorted_arrc[] = {{INT_MAX, 1L}, {3, 1L}, {2, 2L}, {1, 1L}, {1, 1L}}; + using C = std::greater; + { + std::flat_multimap m(std::begin(arr), std::end(arr), C()); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::begin(arrc), std::end(arrc), C()); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::sorted_equivalent, std::begin(sorted_arr), std::end(sorted_arr), C()); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::sorted_equivalent, std::begin(sorted_arrc), std::end(sorted_arrc), C()); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap mo; + std::flat_multimap m(mo.begin(), mo.end(), C()); + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + } + { + std::flat_multimap mo; + std::flat_multimap m(mo.cbegin(), mo.cend(), C()); + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + } +} + +void test_initializer_list() { + const P sorted_arr[] = {{1, 1L}, {1, 1L}, {2, 2L}, {3, 1L}, {INT_MAX, 1L}}; + { + std::flat_multimap m{std::pair{1, 1L}, {2, 2L}, {1, 1L}, {INT_MAX, 1L}, {3, 1L}}; + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::sorted_equivalent, {std::pair{1, 1L}, {1, 1L}, {2, 2L}, {3, 1L}, {INT_MAX, 1L}}); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap s = {std::make_pair(1, 'a')}; // flat_multimap(initializer_list>) + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap); + assert(s.size() == 1); + } + { + using M = std::flat_multimap; + M m; + std::flat_multimap s = {std::make_pair(m, m)}; // flat_multimap(initializer_list>) + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap); + assert(s.size() == 1); + assert(s.find(m)->second == m); + } +} + +void test_initializer_list_compare() { + const P sorted_arr[] = {{INT_MAX, 1L}, {3, 1L}, {2, 2L}, {1, 1L}, {1, 1L}}; + using C = std::greater; + { + std::flat_multimap m({std::pair{1, 1L}, {2, 2L}, {1, 1L}, {INT_MAX, 1L}, {3, 1L}}, C()); + + 
ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::sorted_equivalent, {std::pair{INT_MAX, 1L}, {3, 1L}, {2, 2L}, {1, 1L}, {1, 1L}}, C()); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } +} + +void test_from_range() { + std::list> r = {{1, 1}, {2, 2}, {1, 1}, {INT_MAX, 4}, {3, 5}}; + const std::pair expected[] = {{1, 1}, {1, 1}, {2, 2}, {3, 5}, {INT_MAX, 4}}; + { + std::flat_multimap s(std::from_range, r); + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap>); + assert(std::ranges::equal(s, expected)); + } + { + std::flat_multimap s(std::from_range, r, test_allocator(0, 42)); + ASSERT_SAME_TYPE( + decltype(s), + std::flat_multimap, + std::vector>, + std::vector>>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 42); + assert(s.values().get_allocator().get_id() == 42); + } +} + +void test_from_range_compare() { + std::list> r = {{1, 1}, {2, 2}, {1, 1}, {INT_MAX, 4}, {3, 5}}; + const std::pair expected[] = {{INT_MAX, 4}, {3, 5}, {2, 2}, {1, 1}, {1, 1}}; + { + std::flat_multimap s(std::from_range, r, std::greater()); + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap>); + assert(std::ranges::equal(s, expected)); + } + { + std::flat_multimap s(std::from_range, r, std::greater(), test_allocator(0, 42)); + ASSERT_SAME_TYPE( + decltype(s), + std::flat_multimap, + std::vector>, + std::vector>>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 42); + assert(s.values().get_allocator().get_id() == 42); + } +} + +int main(int, char**) { + // Each test function also tests the sorted_equivalent-prefixed and allocator-suffixed overloads. + test_copy(); + test_containers(); + test_containers_compare(); + test_iter_iter(); + test_iter_iter_compare(); + test_initializer_list(); + test_initializer_list_compare(); + test_from_range(); + test_from_range_compare(); + + AssociativeContainerDeductionGuidesSfinaeAway>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.verify.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.verify.cpp new file mode 100644 index 0000000000000..c25218e890f21 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.verify.cpp @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// Test CTAD on cases where deduction should fail. 
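+// Unlike deduct.compile.pass.cpp, this is a clang-verify test: each ill-formed
+// construction below is paired with the diagnostic the compiler must emit.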
+ +#include +#include +#include + +struct NotAnAllocator { + friend bool operator<(NotAnAllocator, NotAnAllocator) { return false; } +}; + +using P = std::pair; +using PC = std::pair; + +void test() { + { + // cannot deduce that the inner braced things should be std::pair and not something else + std::flat_multimap m{{1, 1L}, {2, 2L}, {3, 3L}}; + // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_multimap'}}}} + } + { + // cannot deduce that the inner braced things should be std::pair and not something else + std::flat_multimap m({{1, 1L}, {2, 2L}, {3, 3L}}, std::less()); + // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_multimap'}}}} + } + { + // cannot deduce that the inner braced things should be std::pair and not something else + std::flat_multimap m({{1, 1L}, {2, 2L}, {3, 3L}}, std::less(), std::allocator()); + // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_multimap'}}}} + } + { + // cannot deduce that the inner braced things should be std::pair and not something else + std::flat_multimap m({{1, 1L}, {2, 2L}, {3, 3L}}, std::allocator()); + // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_multimap'}}}} + } + { + // since we have parens, not braces, this deliberately does not find the initializer_list constructor + std::flat_multimap m(P{1, 1L}); + // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_multimap'}}}} + } + { + // since we have parens, not braces, this deliberately does not find the initializer_list constructor + std::flat_multimap m(PC{1, 1L}); + // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_multimap'}}}} + } +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct_pmr.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct_pmr.pass.cpp new file mode 100644 index 0000000000000..1955a8806631b --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct_pmr.pass.cpp @@ -0,0 +1,107 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: availability-pmr-missing + +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_allocator.h" + +using P = std::pair; +using PC = std::pair; + +void test_containers() { + std::deque> ks({1, 2, 1, 2, 2, INT_MAX, 3}, test_allocator(0, 42)); + std::deque> vs({1, 2, 3, 4, 5, 3, 4}, test_allocator(0, 43)); + std::deque> sorted_ks({1, 1, 2, 2, 2, 3, INT_MAX}, test_allocator(0, 42)); + std::deque> sorted_vs({1, 3, 2, 4, 5, 4, 3}, test_allocator(0, 43)); + const std::pair expected[] = {{1, 1}, {1, 3}, {2, 2}, {2, 4}, {2, 5}, {3, 4}, {INT_MAX, 3}}; + { + std::pmr::monotonic_buffer_resource mr; + std::pmr::monotonic_buffer_resource mr2; + std::pmr::deque pks(ks.begin(), ks.end(), &mr); + std::pmr::deque pvs(vs.begin(), vs.end(), &mr); + std::flat_multimap s(std::move(pks), std::move(pvs), &mr2); + + ASSERT_SAME_TYPE( + decltype(s), std::flat_multimap, std::pmr::deque, std::pmr::deque>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().resource() == &mr2); + assert(s.values().get_allocator().resource() == &mr2); + } + { + std::pmr::monotonic_buffer_resource mr; + std::pmr::monotonic_buffer_resource mr2; + std::pmr::deque pks(sorted_ks.begin(), sorted_ks.end(), &mr); + std::pmr::deque pvs(sorted_vs.begin(), sorted_vs.end(), &mr); + std::flat_multimap s(std::sorted_equivalent, std::move(pks), std::move(pvs), &mr2); + + ASSERT_SAME_TYPE( + decltype(s), std::flat_multimap, std::pmr::deque, std::pmr::deque>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().resource() == &mr2); + assert(s.values().get_allocator().resource() == &mr2); + } +} + +void test_containers_compare() { + std::deque> ks({1, 2, 1, 2, 2, INT_MAX, 3}, test_allocator(0, 42)); + std::deque> vs({1, 2, 3, 4, 5, 3, 4}, test_allocator(0, 43)); + std::deque> sorted_ks({INT_MAX, 3, 2, 2, 2, 1, 1}, test_allocator(0, 42)); + std::deque> sorted_vs({3, 4, 2, 4, 5, 1, 3}, test_allocator(0, 43)); + const std::pair expected[] = {{INT_MAX, 3}, {3, 4}, {2, 2}, {2, 4}, {2, 5}, {1, 1}, {1, 3}}; + + { + std::pmr::monotonic_buffer_resource mr; + std::pmr::monotonic_buffer_resource mr2; + std::pmr::deque pks(ks.begin(), ks.end(), &mr); + std::pmr::deque pvs(vs.begin(), vs.end(), &mr); + std::flat_multimap s(std::move(pks), std::move(pvs), std::greater(), &mr2); + + ASSERT_SAME_TYPE( + decltype(s), std::flat_multimap, std::pmr::deque, std::pmr::deque>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().resource() == &mr2); + assert(s.values().get_allocator().resource() == &mr2); + } + { + std::pmr::monotonic_buffer_resource mr; + std::pmr::monotonic_buffer_resource mr2; + std::pmr::deque pks(sorted_ks.begin(), sorted_ks.end(), &mr); + std::pmr::deque pvs(sorted_vs.begin(), sorted_vs.end(), &mr); + std::flat_multimap s(std::sorted_equivalent, std::move(pks), std::move(pvs), std::greater(), &mr2); + + ASSERT_SAME_TYPE( + decltype(s), std::flat_multimap, std::pmr::deque, std::pmr::deque>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().resource() == &mr2); + assert(s.values().get_allocator().resource() == &mr2); + } +} + +int main(int, char**) { + test_containers(); + test_containers_compare(); + + return 0; +} diff --git 
a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default.pass.cpp new file mode 100644 index 0000000000000..c910f748d95fe --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(); + +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "min_allocator.h" +#include "test_allocator.h" + +struct DefaultCtableComp { + explicit DefaultCtableComp() { default_constructed_ = true; } + bool operator()(int, int) const { return false; } + bool default_constructed_ = false; +}; + +int main(int, char**) { + { + std::flat_multimap m; + assert(m.empty()); + } + { + // explicit(false) + std::flat_multimap m = {}; + assert(m.empty()); + } + { + std::flat_multimap>> m; + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp().default_constructed_); + } + { + using A1 = explicit_allocator; + using A2 = explicit_allocator; + { + std::flat_multimap, std::vector> m; + assert(m.empty()); + assert(m.key_comp().default_constructed_); + } + { + A1 a1; + std::flat_multimap, std::vector> m(a1); + assert(m.empty()); + assert(m.key_comp().default_constructed_); + } + } + { + // If an allocator is given, it must be usable by both containers. + using A = test_allocator; + using M = std::flat_multimap, std::vector, std::vector>; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v>); + static_assert(!std::is_constructible_v); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default_noexcept.pass.cpp new file mode 100644 index 0000000000000..fa490f120875f --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default_noexcept.pass.cpp @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap() +// noexcept( +// is_nothrow_default_constructible_v && +// is_nothrow_default_constructible_v && +// is_nothrow_default_constructible_v); + +// This tests a conforming extension + +#include +#include +#include +#include + +#include "test_macros.h" +#include "MoveOnly.h" +#include "test_allocator.h" + +struct ThrowingCtorComp { + ThrowingCtorComp() noexcept(false) {} + bool operator()(const auto&, const auto&) const { return false; } +}; + +int main(int, char**) { +#if defined(_LIBCPP_VERSION) + { + using C = std::flat_multimap; + static_assert(std::is_nothrow_default_constructible_v); + C c; + } + { + using C = + std::flat_multimap, std::vector>>; + static_assert(std::is_nothrow_default_constructible_v); + C c; + } +#endif // _LIBCPP_VERSION + { + using C = + std::flat_multimap, std::vector>>; + static_assert(!std::is_nothrow_default_constructible_v); + C c; + } + { + using C = std::flat_multimap; + static_assert(!std::is_nothrow_default_constructible_v); + C c; + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/dtor_noexcept.pass.cpp new file mode 100644 index 0000000000000..fd31e440a6614 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/dtor_noexcept.pass.cpp @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// ~flat_multimap(); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "MoveOnly.h" +#include "test_allocator.h" + +struct ThrowingDtorComp { + bool operator()(const auto&, const auto&) const; + ~ThrowingDtorComp() noexcept(false) {} +}; + +int main(int, char**) { + { + using C = std::flat_multimap; + static_assert(std::is_nothrow_destructible_v); + C c; + } + { + using V = std::vector>; + using C = std::flat_multimap, V, V>; + static_assert(std::is_nothrow_destructible_v); + C c; + } + { + using V = std::deque>; + using C = std::flat_multimap, V, V>; + static_assert(std::is_nothrow_destructible_v); + C c; + } +#if defined(_LIBCPP_VERSION) + { + using C = std::flat_multimap; + static_assert(!std::is_nothrow_destructible_v); + C c; + } +#endif // _LIBCPP_VERSION + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/initializer_list.pass.cpp new file mode 100644 index 0000000000000..8e89192ec0ea1 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/initializer_list.pass.cpp @@ -0,0 +1,159 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(initializer_list il, const key_compare& comp = key_compare()); +// template +// flat_multimap(initializer_list il, const Alloc& a); +// template +// flat_multimap(initializer_list il, const key_compare& comp, const Alloc& a); + +#include +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "min_allocator.h" +#include "test_allocator.h" + +#include "../../../test_compare.h" + +struct DefaultCtableComp { + explicit DefaultCtableComp() { default_constructed_ = true; } + bool operator()(int, int) const { return false; } + bool default_constructed_ = false; +}; + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multimap; + using M2 = std::flat_multimap; + using M3 = std::flat_multimap; + using IL = std::initializer_list>; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + + { + // initializer_list needs to match exactly + using M = std::flat_multimap; + using C = typename M::key_compare; + static_assert(std::is_constructible_v>>); + static_assert(std::is_constructible_v>, C>); + static_assert(std::is_constructible_v>, C, std::allocator>); + static_assert(std::is_constructible_v>, std::allocator>); + static_assert(!std::is_constructible_v>>); + static_assert(!std::is_constructible_v>, C>); + static_assert( + !std::is_constructible_v>, C, std::allocator>); + static_assert(!std::is_constructible_v>, std::allocator>); + static_assert(!std::is_constructible_v>>); + static_assert(!std::is_constructible_v>, C>); + static_assert( + !std::is_constructible_v>, C, std::allocator>); + static_assert( + !std::is_constructible_v>, std::allocator>); + } + + std::pair expected[] = {{1, 1}, {2, 2}, {2, 2}, {3, 3}, {3, 3}, {5, 2}}; + { + // flat_multimap(initializer_list); + using M = std::flat_multimap; + std::initializer_list> il = {{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}; + M m(il); + assert(std::ranges::equal(m, expected)); + } + { + // flat_multimap(initializer_list); + // explicit(false) + using M = std::flat_multimap; + M m = {{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}; + assert(std::ranges::equal(m, expected)); + } + { + // flat_multimap(initializer_list); + using M = std::flat_multimap, std::deque>>; + M m = {{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}; + assert(std::equal(m.rbegin(), m.rend(), expected, expected + 6)); + } + { + using A = explicit_allocator; + { + // flat_multimap(initializer_list); + // different comparator + using M = std::flat_multimap, std::deque>; + M m = {{1, 1}, {2, 2}, {3, 3}}; + assert(m.size() == 3); + + std::pair expected1[] = {{1, 1}, {2, 2}, {3, 3}}; + assert(std::ranges::equal(m, expected1)); + assert(m.key_comp().default_constructed_); + } + { + 
// flat_multimap(initializer_list, const Allocator&); + using M = std::flat_multimap, std::deque, std::vector>; + A a; + M m({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, a); + assert(std::equal(m.rbegin(), m.rend(), expected, expected + 6)); + } + } + { + // flat_multimap(initializer_list, const key_compare&); + using C = test_less; + using M = std::flat_multimap; + auto m = M({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, C(10)); + assert(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(10)); + + // explicit(false) + M m2 = {{{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, C(10)}; + assert(m2 == m); + assert(m2.key_comp() == C(10)); + } + { + // flat_multimap(initializer_list, const key_compare&); + // Sorting uses the comparator that was passed in + using M = std::flat_multimap, std::deque>>; + auto m = M({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, std::greater()); + assert(std::equal(m.rbegin(), m.rend(), expected, expected + 6)); + assert(m.key_comp()(2, 1) == true); + } + { + // flat_multimap(initializer_list il, const key_compare& comp, const Alloc& a); + using A = explicit_allocator; + using M = std::flat_multimap, std::deque, std::vector>; + A a; + M m({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, {}, a); + assert(std::equal(m.rbegin(), m.rend(), expected, expected + 6)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/iter_iter.pass.cpp new file mode 100644 index 0000000000000..c9c5e6c99d1c8 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/iter_iter.pass.cpp @@ -0,0 +1,154 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// flat_multimap(InputIterator first, InputIterator last, const key_compare& comp = key_compare()); +// template +// flat_multimap(InputIterator first, InputIterator last, const Allocator& a); +// template +// flat_multimap(InputIterator first, InputIterator last, const key_compare& comp, const Allocator& a); + +#include +#include +#include +#include +#include + +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" +#include "../../../test_compare.h" + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. 
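+    // The same allocator-awareness requirement applies to the iterator-range
+    // constructors; the static_asserts below check it via is_constructible_v.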
+ + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multimap; + using M2 = std::flat_multimap; + using M3 = std::flat_multimap; + using Iter1 = typename M1::iterator; + using Iter2 = typename M2::iterator; + using Iter3 = typename M3::iterator; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + + using P = std::pair; + P ar[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}}; + P expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}}; + { + // flat_multimap(InputIterator , InputIterator) + // cpp17_input_iterator + using M = std::flat_multimap; + auto m = M(cpp17_input_iterator(ar), cpp17_input_iterator(ar + 9)); + assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + + // explicit(false) + M m2 = {cpp17_input_iterator(ar), cpp17_input_iterator(ar + 9)}; + assert(m2 == m); + } + { + // flat_multimap(InputIterator , InputIterator) + // greater + using M = std::flat_multimap, std::deque>, std::deque>; + auto m = M(cpp17_input_iterator(ar), cpp17_input_iterator(ar + 9)); + assert((m.keys() == std::deque>{3, 3, 3, 2, 2, 2, 1, 1, 1})); + LIBCPP_ASSERT((m.values() == std::deque{6, 8, 9, 4, 5, 7, 1, 2, 3})); + } + { + // flat_multimap(InputIterator , InputIterator) + // Test when the operands are of array type (also contiguous iterator type) + using M = std::flat_multimap, std::vector>>; + auto m = M(ar, ar); + assert(m.empty()); + } + { + // flat_multimap(InputIterator , InputIterator, const key_compare&) + using C = test_less; + using M = std::flat_multimap, std::deque>; + auto m = M(ar, ar + 9, C(3)); + assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(3)); + + // explicit(false) + M m2 = {ar, ar + 9, C(3)}; + assert(m2 == m); + assert(m2.key_comp() == C(3)); + } + { + // flat_multimap(InputIterator , InputIterator, const Allocator&) + using A1 = test_allocator; + using A2 = test_allocator; + using M = std::flat_multimap, std::vector, std::deque>; + auto m = M(ar, ar + 9, A1(5)); + assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + assert(m.keys().get_allocator() == A1(5)); + assert(m.values().get_allocator() == A2(5)); + } + { + // flat_multimap(InputIterator , InputIterator, const Allocator&) + // explicit(false) + using A1 = test_allocator; + using A2 = test_allocator; + using M = std::flat_multimap, std::vector, std::deque>; + M m = {ar, ar + 9, A1(5)}; // implicit ctor + assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + assert(m.keys().get_allocator() == A1(5)); + assert(m.values().get_allocator() == A2(5)); + } + { + // flat_multimap(InputIterator , InputIterator, const key_compare&, const Allocator&) + using C = test_less; + using A1 = test_allocator; + using A2 = test_allocator; + using M = std::flat_multimap, std::deque>; + auto m = M(ar, ar + 9, C(3), A1(5)); + assert(std::ranges::equal(m.keys(), expected | 
std::views::elements<0>)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(3)); + assert(m.keys().get_allocator() == A1(5)); + assert(m.values().get_allocator() == A2(5)); + } + { + // flat_multimap(InputIterator , InputIterator, const key_compare&, const Allocator&) + // explicit(false) + using A1 = test_allocator; + using A2 = test_allocator; + using M = std::flat_multimap, std::deque, std::vector>; + M m = {ar, ar + 9, {}, A2(5)}; // implicit ctor + assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + assert(m.keys().get_allocator() == A1(5)); + assert(m.values().get_allocator() == A2(5)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move.pass.cpp new file mode 100644 index 0000000000000..893c9247959d6 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move.pass.cpp @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(flat_multimap&&); + +#include +#include +#include +#include +#include +#include + +#include "../helpers.h" +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" +#include "min_allocator.h" + +int main(int, char**) { + { + using C = test_less; + using A = test_allocator; + using M = std::flat_multimap, std::deque>; + M mo = M({{1, 1}, {1, 2}, {3, 1}}, C(5), A(7)); + M m = std::move(mo); + assert((m == M{{1, 1}, {1, 2}, {3, 1}})); + assert(m.key_comp() == C(5)); + assert(m.keys().get_allocator() == A(7)); + assert(m.values().get_allocator() == A(7)); + + assert(mo.empty()); + assert(mo.key_comp() == C(5)); + assert(mo.keys().get_allocator().get_id() == test_alloc_base::moved_value); + assert(mo.values().get_allocator().get_id() == test_alloc_base::moved_value); + } + { + using C = test_less; + using A = min_allocator; + using M = std::flat_multimap, std::deque>; + M mo = M({{1, 1}, {1, 2}, {3, 1}}, C(5), A()); + M m = std::move(mo); + assert((m == M{{1, 1}, {1, 2}, {3, 1}})); + assert(m.key_comp() == C(5)); + assert(m.keys().get_allocator() == A()); + assert(m.values().get_allocator() == A()); + + assert(mo.empty()); + assert(mo.key_comp() == C(5)); + assert(m.keys().get_allocator() == A()); + assert(m.values().get_allocator() == A()); + } + { + // A moved-from flat_multimap maintains its class invariant in the presence of moved-from comparators. 
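+    // The standard only guarantees a valid-but-unspecified state after move;
+    // keeping the moved-from map sorted (and, in libc++, empty) is what the
+    // assertions below rely on.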
+ using M = std::flat_multimap>; + M mo = M({{1, 1}, {1, 2}, {3, 1}}, std::less()); + M m = std::move(mo); + assert(m.size() == 3); + assert(std::is_sorted(m.begin(), m.end(), m.value_comp())); + assert(m.key_comp()(1, 2) == true); + + assert(std::is_sorted(mo.begin(), mo.end(), mo.value_comp())); + LIBCPP_ASSERT(m.key_comp()(1, 2) == true); + LIBCPP_ASSERT(mo.empty()); + mo.insert({{1, 1}, {1, 2}, {3, 1}}); // insert has no preconditions + assert(m == mo); + } + { + // moved-from object maintains invariant if one of underlying container does not clear after move + using M = std::flat_multimap, std::vector, CopyOnlyVector>; + M m1 = M({1, 1, 3}, {1, 2, 3}); + M m2 = std::move(m1); + assert(m2.size() == 3); + check_invariant(m1); + LIBCPP_ASSERT(m1.empty()); + LIBCPP_ASSERT(m1.keys().size() == 0); + LIBCPP_ASSERT(m1.values().size() == 0); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_alloc.pass.cpp new file mode 100644 index 0000000000000..a0259e805ac5a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_alloc.pass.cpp @@ -0,0 +1,82 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(flat_multimap&&, const allocator_type&); + +#include +#include +#include +#include +#include +#include + +#include "../helpers.h" +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multimap; + using M2 = std::flat_multimap; + using M3 = std::flat_multimap; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + std::pair expected[] = {{1, 1}, {1, 2}, {2, 3}, {2, 2}, {3, 1}}; + using C = test_less; + using A = test_allocator; + using M = std::flat_multimap, std::deque>; + auto mo = M(expected, expected + 5, C(5), A(7)); + auto m = M(std::move(mo), A(3)); + + assert(m.key_comp() == C(5)); + assert(m.size() == 5); + auto [keys, values] = std::move(m).extract(); + assert(keys.get_allocator() == A(3)); + assert(values.get_allocator() == A(3)); + assert(std::ranges::equal(keys, expected | std::views::elements<0>)); + assert(std::ranges::equal(values, expected | std::views::elements<1>)); + + // The original flat_multimap is moved-from. 
+ assert(std::is_sorted(mo.begin(), mo.end(), mo.value_comp())); + assert(mo.empty()); + assert(mo.key_comp() == C(5)); + assert(mo.keys().get_allocator() == A(7)); + assert(mo.values().get_allocator() == A(7)); + } + { + // moved-from object maintains invariant if one of underlying container does not clear after move + using M = std::flat_multimap, std::vector, CopyOnlyVector>; + M m1 = M({1, 1, 3}, {1, 2, 3}); + M m2(std::move(m1), std::allocator{}); + assert(m2.size() == 3); + check_invariant(m1); + LIBCPP_ASSERT(m1.empty()); + LIBCPP_ASSERT(m1.keys().size() == 0); + LIBCPP_ASSERT(m1.values().size() == 0); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign.pass.cpp new file mode 100644 index 0000000000000..38200d008c78a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap& operator=(flat_multimap&&); + +#include +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "MoveOnly.h" +#include "../../../test_compare.h" +#include "test_allocator.h" +#include "min_allocator.h" + +int main(int, char**) { + { + using C = test_less; + using A1 = test_allocator; + using A2 = test_allocator; + using M = std::flat_multimap, std::vector>; + M mo = M({{1, 1}, {1, 3}, {3, 2}}, C(5), A1(7)); + M m = M({}, C(3), A1(7)); + m = std::move(mo); + assert((m == M{{1, 1}, {1, 3}, {3, 2}})); + assert(m.key_comp() == C(5)); + auto [ks, vs] = std::move(m).extract(); + assert(ks.get_allocator() == A1(7)); + assert(vs.get_allocator() == A2(7)); + assert(mo.empty()); + } + { + using C = test_less; + using A1 = other_allocator; + using A2 = other_allocator; + using M = std::flat_multimap, std::deque>; + M mo = M({{4, 5}, {4, 4}}, C(5), A1(7)); + M m = M({{1, 1}, {1, 2}, {1, 3}, {4, 4}}, C(3), A1(7)); + m = std::move(mo); + assert((m == M{{4, 5}, {4, 4}})); + assert(m.key_comp() == C(5)); + auto [ks, vs] = std::move(m).extract(); + assert(ks.get_allocator() == A1(7)); + assert(vs.get_allocator() == A2(7)); + assert(mo.empty()); + } + { + using A = min_allocator; + using M = std::flat_multimap, std::vector, std::vector>; + M mo = M({{5, 1}, {5, 2}, {3, 3}}, A()); + M m = M({{4, 4}, {4, 3}, {4, 2}, {1, 1}}, A()); + m = std::move(mo); + assert((m == M{{5, 1}, {5, 2}, {3, 3}})); + auto [ks, vs] = std::move(m).extract(); + assert(ks.get_allocator() == A()); + assert(vs.get_allocator() == A()); + assert(mo.empty()); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_clears.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_clears.pass.cpp new file mode 100644 index 0000000000000..bc65dca32899c --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_clears.pass.cpp @@ -0,0 +1,101 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap& operator=(flat_multimap&&); +// Preserves the class invariant for the moved-from flat_multimap. + +#include +#include +#include +#include +#include +#include +#include + +#include "../helpers.h" +#include "test_macros.h" + +struct MoveNegates { + int value_ = 0; + MoveNegates() = default; + MoveNegates(int v) : value_(v) {} + MoveNegates(MoveNegates&& rhs) : value_(rhs.value_) { rhs.value_ = -rhs.value_; } + MoveNegates& operator=(MoveNegates&& rhs) { + value_ = rhs.value_; + rhs.value_ = -rhs.value_; + return *this; + } + ~MoveNegates() = default; + auto operator<=>(const MoveNegates&) const = default; +}; + +struct MoveClears { + int value_ = 0; + MoveClears() = default; + MoveClears(int v) : value_(v) {} + MoveClears(MoveClears&& rhs) : value_(rhs.value_) { rhs.value_ = 0; } + MoveClears& operator=(MoveClears&& rhs) { + value_ = rhs.value_; + rhs.value_ = 0; + return *this; + } + ~MoveClears() = default; + auto operator<=>(const MoveClears&) const = default; +}; + +int main(int, char**) { + { + const std::pair expected[] = {{1, 1}, {1, 2}, {3, 3}, {3, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}}; + using M = std::flat_multimap, std::vector>; + M m = M(expected, expected + 8); + M m2 = M(expected, expected + 3); + + m2 = std::move(m); + + assert(std::equal(m2.begin(), m2.end(), expected, expected + 8)); + LIBCPP_ASSERT(m.empty()); + check_invariant(m); + m.insert({1, 1}); + m.insert({2, 2}); + assert(m.contains(1)); + assert(m.find(2) != m.end()); + } + { + const std::pair expected[] = {{1, 1}, {1, 2}, {3, 3}, {4, 4}, {5, 5}, {5, 6}, {7, 7}, {8, 8}}; + using M = std::flat_multimap, std::vector>; + M m = M(expected, expected + 8); + M m2 = M(expected, expected + 3); + + m2 = std::move(m); + + assert(std::equal(m2.begin(), m2.end(), expected, expected + 8)); + LIBCPP_ASSERT(m.empty()); + check_invariant(m); + m.insert({1, 1}); + m.insert({2, 2}); + assert(m.contains(1)); + assert(m.find(2) != m.end()); + } + { + // moved-from object maintains invariant if one of underlying container does not clear after move + using M = std::flat_multimap, std::vector, CopyOnlyVector>; + M m1 = M({1, 1, 3}, {1, 2, 3}); + M m2 = M({1, 1}, {1, 2}); + m2 = std::move(m1); + assert(m2.size() == 3); + check_invariant(m1); + LIBCPP_ASSERT(m1.empty()); + LIBCPP_ASSERT(m1.keys().size() == 0); + LIBCPP_ASSERT(m1.values().size() == 0); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_noexcept.pass.cpp new file mode 100644 index 0000000000000..4eb58313f6f72 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_noexcept.pass.cpp @@ -0,0 +1,110 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap& operator=(flat_multimap&& c) +// noexcept( +// is_nothrow_move_assignable::value && +// is_nothrow_move_assignable::value && +// is_nothrow_copy_assignable::value); + +// This tests a conforming extension + +#include +#include +#include +#include +#include + +#include "MoveOnly.h" +#include "test_allocator.h" +#include "test_macros.h" + +struct MoveSensitiveComp { + MoveSensitiveComp() noexcept(false) = default; + MoveSensitiveComp(const MoveSensitiveComp&) noexcept(false) = default; + MoveSensitiveComp(MoveSensitiveComp&& rhs) { rhs.is_moved_from_ = true; } + MoveSensitiveComp& operator=(const MoveSensitiveComp&) noexcept = default; + MoveSensitiveComp& operator=(MoveSensitiveComp&& rhs) { + rhs.is_moved_from_ = true; + return *this; + } + bool operator()(const auto&, const auto&) const { return false; } + bool is_moved_from_ = false; +}; + +struct MoveThrowsComp { + MoveThrowsComp(MoveThrowsComp&&) noexcept(false); + MoveThrowsComp(const MoveThrowsComp&) noexcept(true); + MoveThrowsComp& operator=(MoveThrowsComp&&) noexcept(false); + MoveThrowsComp& operator=(const MoveThrowsComp&) noexcept(true); + bool operator()(const auto&, const auto&) const; +}; + +int main(int, char**) { + { + using C = std::flat_multimap; + LIBCPP_STATIC_ASSERT(std::is_nothrow_move_assignable_v); + } + { + using C = + std::flat_multimap, + std::vector>, + std::vector>>; + static_assert(!std::is_nothrow_move_assignable_v); + } + { + using C = + std::flat_multimap, + std::vector>, + std::vector>>; + static_assert(!std::is_nothrow_move_assignable_v); + } + { + using C = + std::flat_multimap, + std::vector>, + std::vector>>; + LIBCPP_STATIC_ASSERT(std::is_nothrow_move_assignable_v); + } + { + using C = + std::flat_multimap, + std::vector>, + std::vector>>; + LIBCPP_STATIC_ASSERT(std::is_nothrow_move_assignable_v); + } + { + // Test with a comparator that throws on move-assignment. + using C = std::flat_multimap; + LIBCPP_STATIC_ASSERT(!std::is_nothrow_move_assignable_v); + } + { + // Test with a container that throws on move-assignment. + using C = std::flat_multimap, std::pmr::vector, std::vector>; + static_assert(!std::is_nothrow_move_assignable_v); + } + { + // Test with a container that throws on move-assignment. + using C = std::flat_multimap, std::vector, std::pmr::vector>; + static_assert(!std::is_nothrow_move_assignable_v); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp new file mode 100644 index 0000000000000..c2085e32be532 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp
new file mode 100644
index 0000000000000..c2085e32be532
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: no-exceptions
+
+// <flat_map>
+
+// flat_multimap(flat_multimap&& s);
+// If any member function in [flat.multimap.defn] exits via an exception, the invariant is restored.
+
+#include <cassert>
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <utility>
+#include <vector>
+
+#include "../helpers.h"
+#include "test_macros.h"
+
+static int countdown = 0;
+
+struct EvilContainer : std::vector<int> {
+  EvilContainer() = default;
+  EvilContainer(EvilContainer&& rhs) {
+    // Throw on move-construction.
+    if (--countdown == 0) {
+      rhs.insert(rhs.end(), 0);
+      rhs.insert(rhs.end(), 0);
+      throw 42;
+    }
+  }
+};
+
+int main(int, char**) {
+  {
+    using M   = std::flat_multimap<int, int, std::less<int>, EvilContainer, std::vector<int>>;
+    M mo      = {{1, 1}, {1, 2}, {3, 3}};
+    countdown = 1;
+    try {
+      M m = std::move(mo);
+      assert(false); // not reached
+    } catch (int x) {
+      assert(x == 42);
+    }
+    // The source flat_multimap maintains its class invariant.
+    check_invariant(mo);
+    LIBCPP_ASSERT(mo.empty());
+  }
+  {
+    using M   = std::flat_multimap<int, int, std::less<int>, std::vector<int>, EvilContainer>;
+    M mo      = {{1, 1}, {1, 2}, {3, 3}};
+    countdown = 1;
+    try {
+      M m = std::move(mo);
+      assert(false); // not reached
+    } catch (int x) {
+      assert(x == 42);
+    }
+    // The source flat_multimap maintains its class invariant.
+    check_invariant(mo);
+    LIBCPP_ASSERT(mo.empty());
+  }
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_noexcept.pass.cpp
new file mode 100644
index 0000000000000..e038902e26d52
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_noexcept.pass.cpp
@@ -0,0 +1,104 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// flat_multimap(flat_multimap&&)
+//        noexcept(is_nothrow_move_constructible<key_container_type>::value &&
+//                 is_nothrow_move_constructible<mapped_container_type>::value &&
+//                 is_nothrow_copy_constructible<key_compare>::value);
+
+// This tests a conforming extension
+
+#include <cassert>
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <memory_resource>
+#include <type_traits>
+#include <vector>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+template <class T>
+struct ThrowingMoveAllocator {
+  using value_type                                    = T;
+  explicit ThrowingMoveAllocator()                    = default;
+  ThrowingMoveAllocator(const ThrowingMoveAllocator&) = default;
+  ThrowingMoveAllocator(ThrowingMoveAllocator&&) noexcept(false) {}
+  T* allocate(std::ptrdiff_t n) { return std::allocator<T>().allocate(n); }
+  void deallocate(T* p, std::ptrdiff_t n) { return std::allocator<T>().deallocate(p, n); }
+  friend bool operator==(ThrowingMoveAllocator, ThrowingMoveAllocator) = default;
+};
+
+struct ThrowingMoveComp {
+  ThrowingMoveComp() = default;
+  ThrowingMoveComp(const ThrowingMoveComp&) noexcept(true) {}
+  ThrowingMoveComp(ThrowingMoveComp&&) noexcept(false) {}
+  bool operator()(const auto&, const auto&) const { return false; }
+};
+
+struct MoveSensitiveComp {
+  MoveSensitiveComp() noexcept(false)                  = default;
+  MoveSensitiveComp(const MoveSensitiveComp&) noexcept = default;
+  MoveSensitiveComp(MoveSensitiveComp&& rhs) { rhs.is_moved_from_ = true; }
+  MoveSensitiveComp& operator=(const MoveSensitiveComp&) noexcept(false) = default;
+  MoveSensitiveComp& operator=(MoveSensitiveComp&& rhs) {
+    rhs.is_moved_from_ = true;
+    return *this;
+  }
+  bool operator()(const auto&, const auto&) const { return false; }
+  bool is_moved_from_ = false;
+};
+
+int main(int, char**) {
+  {
+    using C = std::flat_multimap<MoveOnly, MoveOnly>;
+    LIBCPP_STATIC_ASSERT(std::is_nothrow_move_constructible_v<C>);
+    C c;
+    C d = std::move(c);
+  }
+  {
+    using C = std::flat_multimap<int, int, std::less<int>, std::deque<int, test_allocator<int>>>;
+    LIBCPP_STATIC_ASSERT(std::is_nothrow_move_constructible_v<C>);
+    C c;
+    C d = std::move(c);
+  }
+#if _LIBCPP_VERSION
+  {
+    // Container fails to be nothrow-move-constructible; this relies on libc++'s support for non-nothrow-copyable allocators
+    using C =
+        std::flat_multimap<int, int, std::less<int>, std::deque<int, ThrowingMoveAllocator<int>>, std::vector<int>>;
+    static_assert(!std::is_nothrow_move_constructible_v<std::deque<int, ThrowingMoveAllocator<int>>>);
+    static_assert(!std::is_nothrow_move_constructible_v<C>);
+    C c;
+    C d = std::move(c);
+  }
+  {
+    // Container fails to be nothrow-move-constructible; this relies on libc++'s support for non-nothrow-copyable allocators
+    using C =
+        std::flat_multimap<int, int, std::less<int>, std::vector<int>, std::deque<int, ThrowingMoveAllocator<int>>>;
+    static_assert(!std::is_nothrow_move_constructible_v<std::deque<int, ThrowingMoveAllocator<int>>>);
+    static_assert(!std::is_nothrow_move_constructible_v<C>);
+    C c;
+    C d = std::move(c);
+  }
+#endif // _LIBCPP_VERSION
+  {
+    // Comparator fails to be nothrow-move-constructible
+    using C = std::flat_multimap<int, int, ThrowingMoveComp>;
+    static_assert(!std::is_nothrow_move_constructible_v<C>);
+    C c;
+    C d = std::move(c);
+  }
+  return 0;
+}
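Both *_noexcept tests above exercise a conforming extension: libc++ gives flat_multimap's move operations a stronger noexcept specification than the standard requires, so the positive assertions are wrapped in LIBCPP_STATIC_ASSERT while the negative ones must hold for any conforming implementation. The condition under test boils down to a conjunction of traits on the adapted types; a minimal sketch (illustrative names, not library internals):

    #include <type_traits>

    // The move constructor's expected noexcept condition, per the tests above.
    template <class KeyContainer, class MappedContainer, class Compare>
    inline constexpr bool nothrow_move_ctor_expected =
        std::is_nothrow_move_constructible_v<KeyContainer> &&
        std::is_nothrow_move_constructible_v<MappedContainer> &&
        std::is_nothrow_copy_constructible_v<Compare>;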
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/pmr.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/pmr.pass.cpp
new file mode 100644
index 0000000000000..8b518f6afbda9
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/pmr.pass.cpp
@@ -0,0 +1,361 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: availability-pmr-missing
+
+// <flat_map>
+
+// Test various constructors with pmr
+
+#include <algorithm>
+#include <cassert>
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <memory_resource>
+#include <ranges>
+#include <string>
+#include <vector>
+
+#include "test_iterators.h"
+#include "test_macros.h"
+#include "test_allocator.h"
+#include "../../../test_compare.h"
+
+int main(int, char**) {
+  {
+    // flat_multimap(const Allocator& a);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::polymorphic_allocator<int> pa = &mr;
+    auto m1 = M(pa);
+    assert(m1.empty());
+    assert(m1.keys().get_allocator() == pa);
+    assert(m1.values().get_allocator() == pa);
+    auto m2 = M(&mr);
+    assert(m2.empty());
+    assert(m2.keys().get_allocator() == pa);
+    assert(m2.values().get_allocator() == pa);
+  }
+  {
+    // flat_multimap(const key_compare& comp, const Alloc& a);
+    using M = std::flat_multimap<int, int, std::greater<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    vm.emplace_back(std::greater<int>());
+    assert(vm[0] == M{});
+    assert(vm[0].key_comp()(2, 1) == true);
+    assert(vm[0].value_comp()({2, 0}, {1, 0}) == true);
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(const key_container_type& key_cont, const mapped_container_type& mapped_cont,
+    //               const Allocator& a);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::pmr::vector<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+    std::pmr::vector<int> vs = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+    assert(ks.get_allocator().resource() != &mr);
+    assert(vs.get_allocator().resource() != &mr);
+    vm.emplace_back(ks, vs);
+    assert(ks.size() == 9); // ks' value is unchanged, since it was an lvalue above
+    assert(vs.size() == 9); // vs' value is unchanged, since it was an lvalue above
+    assert((vm[0] == M{{1, 1}, {1, 1}, {1, 1}, {2, 2}, {2, 2}, {2, 2}, {3, 3}, {3, 3}, {3, 3}}));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(const flat_multimap&, const allocator_type&);
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr1;
+    std::pmr::monotonic_buffer_resource mr2;
+    M mo = M({1, 2, 1}, {2, 2, 1}, C(5), &mr1);
+    M m  = {mo, &mr2}; // also test the implicitness of this constructor
+
+    assert(m.key_comp() == C(5));
+    assert((m.keys() == std::pmr::vector<int>{1, 1, 2}));
+    assert((m.values() == std::pmr::vector<int>{2, 1, 2}));
+    assert(m.keys().get_allocator().resource() == &mr2);
+    assert(m.values().get_allocator().resource() == &mr2);
+
+    // mo is unchanged
+    assert(mo.key_comp() == C(5));
+    assert((mo.keys() == std::pmr::vector<int>{1, 1, 2}));
+    assert((mo.values() == std::pmr::vector<int>{2, 1, 2}));
+    assert(mo.keys().get_allocator().resource() == &mr1);
+    assert(mo.values().get_allocator().resource() == &mr1);
+  }
+  {
+    // flat_multimap(const flat_multimap&, const allocator_type&);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::deque<int>>;
+    std::pmr::vector<M> vs;
+    M m = {{1, 2}, {1, 2}, {3, 1}};
+    vs.push_back(m);
+    assert(vs[0] == m);
+  }
+  {
+    // flat_multimap& operator=(const flat_multimap& m);
+    // pmr allocator is not propagated
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::deque<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr1;
+    std::pmr::monotonic_buffer_resource mr2;
+    M mo = M({{1, 1}, {1, 2}, {3, 3}}, &mr1);
+    M m  = M({{4, 4}, {4, 5}}, &mr2);
+    m    = mo;
+    assert((m == M{{1, 1}, {1, 2}, {3, 3}}));
+    assert(m.keys().get_allocator().resource() == &mr2);
+    assert(m.values().get_allocator().resource() == &mr2);
+
+    // mo is unchanged
+    assert((mo == M{{1, 1}, {1, 2}, {3, 3}}));
+    assert(mo.keys().get_allocator().resource() == &mr1);
+  }
+  {
+    // flat_multimap(const flat_multimap& m);
+    using C = test_less<int>;
+    std::pmr::monotonic_buffer_resource mr;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::vector<int>>;
+    auto mo = M({{1, 1}, {1, 2}, {3, 3}}, C(5), &mr);
+    auto m  = mo;
+
+    assert(m.key_comp() == C(5));
+    assert((m == M{{1, 1}, {1, 2}, {3, 3}}));
+    auto [ks, vs] = std::move(m).extract();
+    assert(ks.get_allocator().resource() == std::pmr::get_default_resource());
+    assert(vs.get_allocator().resource() == std::pmr::get_default_resource());
+
+    // mo is unchanged
+    assert(mo.key_comp() == C(5));
+    assert((mo == M{{1, 1}, {1, 2}, {3, 3}}));
+    auto [kso, vso] = std::move(mo).extract();
+    assert(kso.get_allocator().resource() == &mr);
+    assert(vso.get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(initializer_list<value_type> il, const Alloc& a);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::initializer_list<M::value_type> il = {{3, 3}, {1, 1}, {4, 4}, {1, 1}, {5, 5}};
+    vm.emplace_back(il);
+    assert((vm[0] == M{{1, 1}, {1, 1}, {3, 3}, {4, 4}, {5, 5}}));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(initializer_list<value_type> il, const key_compare& comp, const Alloc& a);
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::deque<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::initializer_list<M::value_type> il = {{3, 3}, {1, 1}, {4, 4}, {1, 1}, {5, 5}};
+    vm.emplace_back(il, C(5));
+    assert((vm[0] == M{{1, 1}, {1, 1}, {3, 3}, {4, 4}, {5, 5}}));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+    assert(vm[0].key_comp() == C(5));
+  }
+  {
+    // flat_multimap(InputIterator first, InputIterator last, const Allocator& a);
+    using P      = std::pair<int, int>;
+    P ar[]       = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}};
+    P expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}};
+    {
+      // cpp17 iterator
+      using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+      std::pmr::monotonic_buffer_resource mr;
+      std::pmr::vector<M> vm(&mr);
+      vm.emplace_back(cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 9));
+      assert(std::ranges::equal(vm[0].keys(), expected | std::views::elements<0>));
+      LIBCPP_ASSERT(std::ranges::equal(vm[0], expected));
+      assert(vm[0].keys().get_allocator().resource() == &mr);
+      assert(vm[0].values().get_allocator().resource() == &mr);
+    }
+    {
+      using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+      std::pmr::monotonic_buffer_resource mr;
+      std::pmr::vector<M> vm(&mr);
+      vm.emplace_back(ar, ar);
+      assert(vm[0].empty());
+      assert(vm[0].keys().get_allocator().resource() == &mr);
+      assert(vm[0].values().get_allocator().resource() == &mr);
+    }
+  }
+  {
+    // flat_multimap(flat_multimap&&, const allocator_type&);
+    std::pair<int, int> expected[] = {{1, 1}, {1, 1}, {2, 2}, {3, 1}};
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::deque<int>>;
+    std::pmr::monotonic_buffer_resource mr1;
+    std::pmr::monotonic_buffer_resource mr2;
+    M mo = M({{1, 1}, {3, 1}, {1, 1}, {2, 2}}, C(5), &mr1);
+    M m  = {std::move(mo), &mr2}; // also test the implicitness of this constructor
+
+    assert(m.key_comp() == C(5));
+    assert(m.size() == 4);
+    assert(m.keys().get_allocator().resource() == &mr2);
+    assert(m.values().get_allocator().resource() == &mr2);
+    assert(std::ranges::equal(m, expected));
+
+    // The original flat_multimap is moved-from.
+    assert(std::is_sorted(mo.begin(), mo.end(), mo.value_comp()));
+    assert(mo.key_comp() == C(5));
+    assert(mo.keys().get_allocator().resource() == &mr1);
+    assert(mo.values().get_allocator().resource() == &mr1);
+  }
+  {
+    // flat_multimap(flat_multimap&&, const allocator_type&);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::deque<int>, std::pmr::vector<int>>;
+    std::pmr::vector<M> vs;
+    M m = {{1, 1}, {3, 1}, {1, 1}, {2, 2}};
+    vs.push_back(std::move(m));
+    assert((vs[0].keys() == std::pmr::deque<int>{1, 1, 2, 3}));
+    assert((vs[0].values() == std::pmr::vector<int>{1, 1, 2, 1}));
+  }
+  {
+    // flat_multimap& operator=(flat_multimap&&);
+    using M = std::
+        flat_multimap<std::pmr::string, int, std::less<>, std::pmr::vector<std::pmr::string>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr1;
+    std::pmr::monotonic_buffer_resource mr2;
+    M mo = M({{"short", 1},
+              {"very long string that definitely won't fit in the SSO buffer and therefore becomes empty on move", 2}},
+             &mr1);
+    M m = M({{"don't care", 3}}, &mr2);
+    m   = std::move(mo);
+    assert(m.size() == 2);
+    assert(std::is_sorted(m.begin(), m.end(), m.value_comp()));
+    assert(m.begin()->first.get_allocator().resource() == &mr2);
+
+    assert(std::is_sorted(mo.begin(), mo.end(), mo.value_comp()));
+    mo.insert({"foo", 1});
+    assert(mo.begin()->first.get_allocator().resource() == &mr1);
+  }
+  {
+    // flat_multimap(from_range_t, R&&, const Alloc&);
+    using P      = std::pair<int, int>;
+    P ar[]       = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}};
+    P expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}};
+    {
+      // input_range
+      using M    = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+      using Iter = cpp20_input_iterator<const P*>;
+      using Sent = sentinel_wrapper<Iter>;
+      using R    = std::ranges::subrange<Iter, Sent>;
+      std::pmr::monotonic_buffer_resource mr;
+      std::pmr::vector<M> vm(&mr);
+      vm.emplace_back(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
+      assert(std::ranges::equal(vm[0].keys(), expected | std::views::elements<0>));
+      LIBCPP_ASSERT(std::ranges::equal(vm[0], expected));
+      assert(vm[0].keys().get_allocator().resource() == &mr);
+      assert(vm[0].values().get_allocator().resource() == &mr);
+    }
+    {
+      using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+      using R = std::ranges::subrange<const P*>;
+      std::pmr::monotonic_buffer_resource mr;
+      std::pmr::vector<M> vm(&mr);
+      vm.emplace_back(std::from_range, R(ar, ar));
+      assert(vm[0].empty());
+      assert(vm[0].keys().get_allocator().resource() == &mr);
+      assert(vm[0].values().get_allocator().resource() == &mr);
+    }
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, const key_container_type& key_cont,
+    //               const mapped_container_type& mapped_cont, const Alloc& a);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::pmr::vector<int> ks = {1, 1, 4, 10};
+    std::pmr::vector<int> vs = {4, 3, 2, 1};
+    vm.emplace_back(std::sorted_equivalent, ks, vs);
+    assert(!ks.empty()); // it was an lvalue above
+    assert(!vs.empty()); // it was an lvalue above
+    assert((vm[0] == M{{1, 4}, {1, 3}, {4, 2}, {10, 1}}));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, const key_container_type& key_cont,
+    //               const mapped_container_type& mapped_cont, const Alloc& a);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::pmr::vector<int> ks({1, 1, 4, 10}, &mr);
+    std::pmr::vector<int> vs({4, 3, 2, 1}, &mr);
+    vm.emplace_back(std::sorted_equivalent, ks, vs);
+    assert((vm[0] == M{{1, 4}, {1, 3}, {4, 2}, {10, 1}}));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last,
+    //               const key_compare& comp, const Alloc& a);
+    // cpp_17
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    using P = std::pair<int, int>;
+    P ar[]  = {{1, 1}, {1, 2}, {1, 4}, {5, 5}};
+    vm.emplace_back(
+        std::sorted_equivalent, cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 4), C(3));
+    assert((vm[0] == M{{1, 1}, {1, 2}, {1, 4}, {5, 5}}));
+    assert(vm[0].key_comp() == C(3));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last,
+    //               const key_compare& comp, const Alloc& a);
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::pair<int, int> ar[1] = {{42, 42}};
+    vm.emplace_back(std::sorted_equivalent, ar, ar, C(4));
+    assert(vm[0] == M{});
+    assert(vm[0].key_comp() == C(4));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last,
+    //               const key_compare& comp, const Alloc& a);
+    // cpp_17
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    using P = std::pair<int, int>;
+    P ar[]  = {{1, 1}, {1, 2}, {1, 4}, {5, 5}};
+    vm.emplace_back(
+        std::sorted_equivalent, cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 4), C(3));
+    assert((vm[0] == M{{1, 1}, {1, 2}, {1, 4}, {5, 5}}));
+    assert(vm[0].key_comp() == C(3));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last,
+    //               const key_compare& comp, const Alloc& a);
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::pair<int, int> ar[1] = {{42, 42}};
+    vm.emplace_back(std::sorted_equivalent, ar, ar, C(4));
+    assert(vm[0] == M{});
+    assert(vm[0].key_comp() == C(4));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+
+  return 0;
+}
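A pattern that recurs throughout pmr.pass.cpp: the map is built via emplace_back into a std::pmr::vector<M>, and the test then asserts that keys() and values() ended up on the vector's memory resource. That works because polymorphic_allocator performs uses-allocator construction of the element, passing its resource down into the nested containers. A condensed sketch of the mechanism (standard behavior only, no test-suite helpers):

    #include <cassert>
    #include <flat_map>
    #include <functional>
    #include <memory_resource>
    #include <vector>

    int main() {
      using M = std::flat_multimap<int, int, std::less<int>,
                                   std::pmr::vector<int>, std::pmr::vector<int>>;
      std::pmr::monotonic_buffer_resource mr;
      std::pmr::vector<M> vm(&mr); // outer container owns the resource
      vm.emplace_back();           // uses-allocator construction of M
      assert(vm[0].keys().get_allocator().resource() == &mr);   // propagated
      assert(vm[0].values().get_allocator().resource() == &mr); // propagated
    }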
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/range.pass.cpp
new file mode 100644
index 0000000000000..de750e2506341
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/range.pass.cpp
@@ -0,0 +1,227 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// template <container-compatible-range<value_type> R>
+//   flat_multimap(from_range_t, R&&)
+// template <container-compatible-range<value_type> R>
+//   flat_multimap(from_range_t, R&&, const key_compare&)
+// template <container-compatible-range<value_type> R, class Alloc>
+//   flat_multimap(from_range_t, R&&, const Alloc&);
+// template <container-compatible-range<value_type> R, class Alloc>
+//   flat_multimap(from_range_t, R&&, const key_compare&, const Alloc&);
+
+#include <algorithm>
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <ranges>
+#include <vector>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+#include "../../../test_compare.h"
+
+// test constraint container-compatible-range
+
+template <class V>
+using RangeOf = std::ranges::subrange<V*>;
+using Map     = std::flat_multimap<int, double>;
+
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, double>>>);
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, int>>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<double>>);
+
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, double>>, std::less<int>>);
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, int>>, std::less<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<int>, std::less<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<double>, std::less<int>>);
+
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, double>>, std::allocator<int>>);
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, int>>, std::allocator<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<int>, std::allocator<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<double>, std::allocator<int>>);
+
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, double>>,
+                                      std::less<int>,
+                                      std::allocator<int>>);
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, int>>,
+                                      std::less<int>,
+                                      std::allocator<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<int>, std::less<int>, std::allocator<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<double>, std::less<int>, std::allocator<int>>);
+
+int main(int, char**) {
+  {
+    // The constructors in this subclause shall not participate in overload
+    // resolution unless uses_allocator_v<key_container_type, Alloc> is true
+    // and uses_allocator_v<mapped_container_type, Alloc> is true.
+
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = other_allocator<int>;
+    using V1 = std::vector<int, A1>;
+    using V2 = std::vector<int, A2>;
+    using M1 = std::flat_multimap<int, int, C, V1, V1>;
+    using M2 = std::flat_multimap<int, int, C, V1, V2>;
+    using M3 = std::flat_multimap<int, int, C, V2, V1>;
+    static_assert(std::is_constructible_v<M1, std::from_range_t, M1, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::from_range_t, M1, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::from_range_t, M2, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::from_range_t, M3, const A2&>);
+
+    static_assert(std::is_constructible_v<M1, std::from_range_t, M1, const C&, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::from_range_t, M1, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::from_range_t, M2, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::from_range_t, M3, const C&, const A2&>);
+  }
+  {
+    // container-compatible-range
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<std::string>;
+    using M  = std::flat_multimap<int, std::string, C, std::vector<int, A1>, std::vector<std::string, A2>>;
+    using Pair        = std::pair<int, std::string>;
+    using PairLike    = std::tuple<int, std::string>;
+    using NonPairLike = int;
+
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<Pair>&>);
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<PairLike>&>);
+    static_assert(!std::is_constructible_v<M, std::from_range_t, std::vector<NonPairLike>&>);
+
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<Pair>&, const C&>);
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<PairLike>&, const C&>);
+    static_assert(!std::is_constructible_v<M, std::from_range_t, std::vector<NonPairLike>&, const C&>);
+
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<Pair>&, const A1&>);
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<PairLike>&, const A1&>);
+    static_assert(!std::is_constructible_v<M, std::from_range_t, std::vector<NonPairLike>&, const A1&>);
+
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<Pair>&, const C&, const A1&>);
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<PairLike>&, const C&, const A1&>);
+    static_assert(!std::is_constructible_v<M, std::from_range_t, std::vector<NonPairLike>&, const C&, const A1&>);
+  }
+
+  using P      = std::pair<int, int>;
+  P ar[]       = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}};
+  P expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}};
+  {
+    // flat_multimap(from_range_t, R&&)
+    // input_range && !common
+    using M    = std::flat_multimap<int, int>;
+    using Iter = cpp20_input_iterator<const P*>;
+    using Sent = sentinel_wrapper<Iter>;
+    using R    = std::ranges::subrange<Iter, Sent>;
+    auto m     = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+
+    // explicit(false)
+    M m2 = {std::from_range, R(Iter(ar), Sent(Iter(ar + 9)))};
+    assert(m2 == m);
+  }
+  {
+    // flat_multimap(from_range_t, R&&)
+    // greater
+    using M    = std::flat_multimap<int, int, std::greater<int>, std::deque<int, min_allocator<int>>, std::deque<int>>;
+    using Iter = cpp20_input_iterator<const P*>;
+    using Sent = sentinel_wrapper<Iter>;
+    using R    = std::ranges::subrange<Iter, Sent>;
+    auto m     = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
+    assert((m.keys() == std::deque<int, min_allocator<int>>{3, 3, 3, 2, 2, 2, 1, 1, 1}));
+    LIBCPP_ASSERT((m.values() == std::deque<int>{6, 8, 9, 4, 5, 7, 1, 2, 3}));
+  }
+  {
+    // flat_multimap(from_range_t, R&&)
+    // contiguous range
+    using M = std::flat_multimap<int, int>;
+    using R = std::ranges::subrange<const P*>;
+    auto m  = M(std::from_range, R(ar, ar + 9));
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+  }
+  {
+    // flat_multimap(from_range_t, R&&, const key_compare&)
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::vector<int>, std::deque<int>>;
+    using R = std::ranges::subrange<const P*>;
+    auto m  = M(std::from_range, R(ar, ar + 9), C(3));
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+    assert(m.key_comp() == C(3));
+
+    // explicit(false)
+    M m2 = {std::from_range, R(ar, ar + 9), C(3)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(3));
+  }
+  {
+    // flat_multimap(from_range_t, R&&, const Allocator&)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, std::less<int>, std::vector<int, A1>, std::deque<int, A2>>;
+    using R  = std::ranges::subrange<const P*>;
+    auto m   = M(std::from_range, R(ar, ar + 9), A1(5));
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(from_range_t, R&&, const Allocator&)
+    // explicit(false)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, std::less<int>, std::vector<int, A1>, std::deque<int, A2>>;
+    using R  = std::ranges::subrange<const P*>;
+    M m      = {std::from_range, R(ar, ar + 9), A1(5)}; // implicit ctor
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(from_range_t, R&&, const key_compare&, const Allocator&)
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, C, std::vector<int, A1>, std::deque<int, A2>>;
+    using R  = std::ranges::subrange<const P*>;
+    auto m   = M(std::from_range, R(ar, ar + 9), C(3), A1(5));
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+    assert(m.key_comp() == C(3));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(from_range_t, R&&, const key_compare&, const Allocator&)
+    // explicit(false)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, std::less<int>, std::deque<int, A1>, std::vector<int, A2>>;
+    using R  = std::ranges::subrange<const P*>;
+    M m      = {std::from_range, R(ar, ar + 9), {}, A2(5)}; // implicit ctor
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+
+  return 0;
+}
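The is_constructible checks at the top of range.pass.cpp all hinge on the exposition-only container-compatible-range concept: the range's reference type must convert to the map's value_type. A simplified model of the idea (the real exposition-only concept in the standard has more moving parts):

    #include <concepts>
    #include <ranges>
    #include <utility>
    #include <vector>

    // Simplified: an input range whose references convert to the value type T.
    template <class R, class T>
    concept compatible_range_sketch =
        std::ranges::input_range<R> &&
        std::convertible_to<std::ranges::range_reference_t<R>, T>;

    static_assert(compatible_range_sketch<std::vector<std::pair<int, int>>, std::pair<int, int>>);
    static_assert(!compatible_range_sketch<std::vector<int>, std::pair<int, int>>);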
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_container.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_container.pass.cpp
new file mode 100644
index 0000000000000..16579f0deed5d
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_container.pass.cpp
@@ -0,0 +1,165 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// flat_multimap(sorted_equivalent_t, key_container_type key_cont, mapped_container_type mapped_cont,
+//               const key_compare& comp = key_compare());
+//
+// template <class Alloc>
+//   flat_multimap(sorted_equivalent_t, const key_container_type& key_cont,
+//                 const mapped_container_type& mapped_cont, const Alloc& a);
+// template <class Alloc>
+//   flat_multimap(sorted_equivalent_t, const key_container_type& key_cont,
+//                 const mapped_container_type& mapped_cont,
+//                 const key_compare& comp, const Alloc& a);
+
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <vector>
+
+#include "min_allocator.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+#include "../../../test_compare.h"
+
+int main(int, char**) {
+  {
+    // The constructors in this subclause shall not participate in overload
+    // resolution unless uses_allocator_v<key_container_type, Alloc> is true
+    // and uses_allocator_v<mapped_container_type, Alloc> is true.
+
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = other_allocator<int>;
+    using V1 = std::vector<int, A1>;
+    using V2 = std::vector<int, A2>;
+    using M1 = std::flat_multimap<int, int, C, V1, V1>;
+    using M2 = std::flat_multimap<int, int, C, V1, V2>;
+    using M3 = std::flat_multimap<int, int, C, V2, V1>;
+    static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const V1&, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const V1&, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, const V1&, const V2&, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::sorted_equivalent_t, const V2&, const V1&, const A2&>);
+
+    static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const V1&, const C&, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const V1&, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, const V1&, const V2&, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::sorted_equivalent_t, const V2&, const V1&, const C&, const A2&>);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, key_container_type , mapped_container_type)
+    using M = std::flat_multimap<int, int>;
+    std::vector<int> ks = {1, 4, 4, 10};
+    std::vector<int> vs = {4, 3, 2, 1};
+    auto ks2 = ks;
+    auto vs2 = vs;
+
+    auto m = M(std::sorted_equivalent, ks, vs);
+    assert((m == M{{1, 4}, {4, 3}, {4, 2}, {10, 1}}));
+    m = M(std::sorted_equivalent, std::move(ks), std::move(vs));
+    assert(ks.empty()); // it was moved-from
+    assert(vs.empty()); // it was moved-from
+    assert((m == M{{1, 4}, {4, 3}, {4, 2}, {10, 1}}));
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, std::move(ks2), std::move(vs2)};
+    assert(m == m2);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, key_container_type , mapped_container_type)
+    // non-default container, comparator and allocator type
+    using Ks = std::deque<int, min_allocator<int>>;
+    using Vs = std::deque<char, min_allocator<char>>;
+    using M  = std::flat_multimap<int, char, std::greater<int>, Ks, Vs>;
+    Ks ks    = {10, 1, 1, 1};
+    Vs vs    = {1, 2, 3, 4};
+    auto m   = M(std::sorted_equivalent, ks, vs);
+    assert((m == M{{1, 2}, {1, 3}, {1, 4}, {10, 1}}));
+    m = M(std::sorted_equivalent, std::move(ks), std::move(vs));
+    assert(ks.empty()); // it was moved-from
+    assert(vs.empty()); // it was moved-from
+    assert((m == M{{1, 2}, {1, 3}, {1, 4}, {10, 1}}));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, key_container_type , mapped_container_type)
+    // allocator copied into the containers
+    using A = test_allocator<int>;
+    using M = std::flat_multimap<int, int, std::less<int>, std::vector<int, A>, std::deque<int, A>>;
+    auto ks = std::vector<int, A>({2, 2, 4, 10}, A(4));
+    auto vs = std::deque<int, A>({4, 3, 2, 1}, A(5));
+    auto m  = M(std::sorted_equivalent, std::move(ks), std::move(vs));
+    assert(ks.empty()); // it was moved-from
+    assert(vs.empty()); // it was moved-from
+    assert((m == M{{2, 4}, {2, 3}, {4, 2}, {10, 1}}));
+    assert(m.keys().get_allocator() == A(4));
+    assert(m.values().get_allocator() == A(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, key_container_type , mapped_container_type, key_compare)
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C>;
+    std::vector<int> ks = {1, 2, 10, 10};
+    std::vector<int> vs = {4, 3, 2, 1};
+
+    auto m = M(std::sorted_equivalent, ks, vs, C(4));
+    assert((m == M{{1, 4}, {2, 3}, {10, 2}, {10, 1}}));
+    assert(m.key_comp() == C(4));
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, ks, vs, C(4)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(4));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, key_container_type , mapped_container_type, key_compare, const Allocator&)
+    using C = test_less<int>;
+    using A = test_allocator<int>;
+    using M = std::flat_multimap<int, int, C, std::vector<int, A>, std::vector<int, A>>;
+    std::vector<int, A> ks = {1, 2, 4, 10};
+    std::vector<int, A> vs = {4, 3, 2, 1};
+    auto m = M(std::sorted_equivalent, ks, vs, C(4), A(5));
+    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
+    assert(m.key_comp() == C(4));
+    assert(m.keys().get_allocator() == A(5));
+    assert(m.values().get_allocator() == A(5));
+
+    // explicit(false)
+    M m2 = {ks, vs, C(4), A(5)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(4));
+    assert(m2.keys().get_allocator() == A(5));
+    assert(m2.values().get_allocator() == A(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, key_container_type , mapped_container_type, const Allocator&)
+    using A = test_allocator<int>;
+    using M = std::flat_multimap<int, int, std::less<int>, std::vector<int, A>, std::deque<int, A>>;
+    auto ks = std::vector<int, A>({1, 2, 4, 4}, A(4));
+    auto vs = std::deque<int, A>({4, 3, 2, 1}, A(5));
+    auto m  = M(std::sorted_equivalent, ks, vs, A(6)); // replaces the allocators
+    assert(!ks.empty()); // it was an lvalue above
+    assert(!vs.empty()); // it was an lvalue above
+    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {4, 1}}));
+    assert(m.keys().get_allocator() == A(6));
+    assert(m.values().get_allocator() == A(6));
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, ks, vs, A(6)};
+    assert(m2 == m);
+    assert(m2.keys().get_allocator() == A(6));
+    assert(m2.values().get_allocator() == A(6));
+  }
+
+  return 0;
+}
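The point of the std::sorted_equivalent tag exercised above is that the constructor may adopt the containers as-is: the precondition is only that the keys are already sorted with respect to the comparator, and equivalent keys are allowed (unlike std::sorted_unique for flat_map). A minimal usage sketch:

    #include <cassert>
    #include <flat_map>
    #include <utility>
    #include <vector>

    int main() {
      std::vector<int> ks = {1, 4, 4, 10}; // pre-sorted, duplicates allowed
      std::vector<int> vs = {4, 3, 2, 1};
      std::flat_multimap<int, int> m(std::sorted_equivalent, std::move(ks), std::move(vs));
      assert(m.size() == 4); // containers adopted without re-sorting
    }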
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_initializer_list.pass.cpp
new file mode 100644
index 0000000000000..b34313bb3d404
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_initializer_list.pass.cpp
@@ -0,0 +1,183 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// flat_multimap(sorted_equivalent_t s, initializer_list<value_type> il,
+//               const key_compare& comp = key_compare())
+// template <class Alloc>
+//   flat_multimap(sorted_equivalent_t, initializer_list<value_type> il, const Alloc& a);
+// template <class Alloc>
+//   flat_multimap(sorted_equivalent_t, initializer_list<value_type> il,
+//                 const key_compare& comp, const Alloc& a);
+
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <vector>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+#include "../../../test_compare.h"
+
+template <class T, class U>
+std::initializer_list<std::pair<T, U>> il = {{1, 1}, {4, 2}, {4, 4}, {5, 5}};
+
+const auto il1 = il<int, int>;
+const auto il2 = il<int, short>;
+const auto il3 = il<short, int>;
+
+int main(int, char**) {
+  {
+    // The constructors in this subclause shall not participate in overload
+    // resolution unless uses_allocator_v<key_container_type, Alloc> is true
+    // and uses_allocator_v<mapped_container_type, Alloc> is true.
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = other_allocator<int>;
+    using V1 = std::vector<int, A1>;
+    using V2 = std::vector<int, A2>;
+    using M1 = std::flat_multimap<int, int, C, V1, V1>;
+    using M2 = std::flat_multimap<int, int, C, V1, V2>;
+    using M3 = std::flat_multimap<int, int, C, V2, V1>;
+    using IL = std::initializer_list<std::pair<int, int>>;
+    static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, IL, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, IL, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, IL, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::sorted_equivalent_t, IL, const A2&>);
+
+    static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, IL, const C&, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, IL, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, IL, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::sorted_equivalent_t, IL, const C&, const A2&>);
+  }
+  {
+    // initializer_list<value_type> needs to match exactly
+    using M = std::flat_multimap<int, int>;
+    using C = typename M::key_compare;
+    static_assert(std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<std::pair<int, int>>>);
+    static_assert(
+        std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<std::pair<int, int>>, C>);
+    static_assert(std::is_constructible_v<M,
+                                          std::sorted_equivalent_t,
+                                          std::initializer_list<std::pair<int, int>>,
+                                          C,
+                                          std::allocator<int>>);
+    static_assert(std::is_constructible_v<M,
+                                          std::sorted_equivalent_t,
+                                          std::initializer_list<std::pair<int, int>>,
+                                          std::allocator<int>>);
+    static_assert(
+        !std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<std::pair<const int, int>>>);
+    static_assert(
+        !std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<std::pair<const int, int>>, C>);
+    static_assert(!std::is_constructible_v<M,
+                                           std::sorted_equivalent_t,
+                                           std::initializer_list<std::pair<const int, int>>,
+                                           C,
+                                           std::allocator<int>>);
+    static_assert(!std::is_constructible_v<M,
+                                           std::sorted_equivalent_t,
+                                           std::initializer_list<std::pair<const int, int>>,
+                                           std::allocator<int>>);
+    static_assert(
+        !std::
+            is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<std::pair<const int, const int>>>);
+    static_assert(!std::is_constructible_v<M,
+                                           std::sorted_equivalent_t,
+                                           std::initializer_list<std::pair<const int, const int>>,
+                                           C>);
+    static_assert(!std::is_constructible_v<M,
+                                           std::sorted_equivalent_t,
+                                           std::initializer_list<std::pair<const int, const int>>,
+                                           C,
+                                           std::allocator<int>>);
+    static_assert(!std::is_constructible_v<M,
+                                           std::sorted_equivalent_t,
+                                           std::initializer_list<std::pair<const int, const int>>,
+                                           std::allocator<int>>);
+  }
+
+  {
+    // flat_multimap(sorted_equivalent_t, initializer_list<value_type>);
+    using M       = std::flat_multimap<int, int>;
+    auto m        = M(std::sorted_equivalent, il1);
+    auto expected = M{{1, 1}, {4, 2}, {4, 4}, {5, 5}};
+    assert(m == expected);
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, il1};
+    assert(m2 == m);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, initializer_list<value_type>, const key_compare&);
+    using M = std::flat_multimap<int, int, std::function<bool(int, int)>>;
+    auto m  = M(std::sorted_equivalent, il1, std::less<int>());
+    assert(m == M({{1, 1}, {4, 2}, {4, 4}, {5, 5}}, std::less<>()));
+    assert(m.key_comp()(1, 2) == true);
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, il1, std::less<int>()};
+    assert(m2 == m);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, initializer_list<value_type>, const key_compare&);
+    // greater
+    using M = std::flat_multimap<int, int, std::greater<int>, std::deque<int, min_allocator<int>>, std::vector<int>>;
+    std::initializer_list<std::pair<int, int>> il4{{5, 5}, {4, 4}, {1, 2}, {1, 1}};
+    auto m = M(std::sorted_equivalent, il4, std::greater<int>());
+    assert((m == M{{5, 5}, {4, 4}, {1, 2}, {1, 1}}));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, initializer_list<value_type>, const Allocator&)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_multimap<int, short, std::less<int>, std::vector<int, A1>, std::deque<short, A2>>;
+    auto m        = M(std::sorted_equivalent, il2, A1(5));
+    auto expected = M{{1, 1}, {4, 2}, {4, 4}, {5, 5}};
+    assert(m == expected);
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, il2, A1(5)};
+    assert(m2 == m);
+    assert(m2.keys().get_allocator() == A1(5));
+    assert(m2.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, initializer_list<value_type>, const key_compare&, const Allocator&);
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_multimap<int, short, C, std::vector<int, A1>, std::deque<short, A2>>;
+    auto m   = M(std::sorted_equivalent, il2, C(3), A1(5));
+    assert((m == M{{1, 1}, {4, 2}, {4, 4}, {5, 5}}));
+    assert(m.key_comp() == C(3));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, initializer_list<value_type>, const key_compare&, const Allocator&);
+    // explicit(false)
+    using A1 = test_allocator<short>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<short, int, std::less<short>, std::deque<short, A1>, std::vector<int, A2>>;
+    M m      = {std::sorted_equivalent, il3, {}, A1(5)}; // implicit ctor
+    assert((m == M{{1, 1}, {4, 2}, {4, 4}, {5, 5}}));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_iter_iter.pass.cpp
new file mode 100644
index 0000000000000..45c4b3dc675a5
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_iter_iter.pass.cpp
@@ -0,0 +1,173 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// template <class InputIterator>
+//   flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last, const key_compare& comp = key_compare());
+// template <class InputIterator, class Alloc>
+//   flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last, const Alloc& a);
+// template <class InputIterator, class Allocator>
+//   flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last, const key_compare& comp, const Allocator& a);
+
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <vector>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+#include "../../../test_compare.h"
+
+int main(int, char**) {
+  {
+    // The constructors in this subclause shall not participate in overload
+    // resolution unless uses_allocator_v<key_container_type, Alloc> is true
+    // and uses_allocator_v<mapped_container_type, Alloc> is true.
+    using C     = test_less<int>;
+    using A1    = test_allocator<int>;
+    using A2    = other_allocator<int>;
+    using V1    = std::vector<int, A1>;
+    using V2    = std::vector<int, A2>;
+    using M1    = std::flat_multimap<int, int, C, V1, V1>;
+    using M2    = std::flat_multimap<int, int, C, V1, V2>;
+    using M3    = std::flat_multimap<int, int, C, V2, V1>;
+    using Iter1 = typename M1::iterator;
+    using Iter2 = typename M2::iterator;
+    using Iter3 = typename M3::iterator;
+    static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, Iter1, Iter1, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, Iter1, Iter1, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, Iter2, Iter2, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::sorted_equivalent_t, Iter3, Iter3, const A2&>);
+
+    static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, Iter1, Iter1, const C&, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, Iter1, Iter1, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, Iter2, Iter2, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::sorted_equivalent_t, Iter3, Iter3, const C&, const A2&>);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator);
+    // cpp17_input_iterator
+    using M = std::flat_multimap<int, int>;
+    using P = std::pair<int, int>;
+    P ar[]  = {{1, 1}, {4, 4}, {5, 5}, {5, 2}};
+    auto m  = M(std::sorted_equivalent, cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 4));
+    auto expected = M{{1, 1}, {4, 4}, {5, 5}, {5, 2}};
+    assert(m == expected);
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 4)};
+    assert(m2 == m);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator);
+    // contiguous iterator
+    using C = test_less<int>;
+    using M =
+        std::flat_multimap<int, int, C, std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>;
+    std::pair<int, int> ar[] = {{1, 1}, {1, 4}, {2, 2}, {5, 5}};
+    auto m                   = M(std::sorted_equivalent, ar, ar + 4);
+    auto expected            = M{{1, 1}, {1, 4}, {2, 2}, {5, 5}};
+    assert(m == expected);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
+    // cpp_17_input_iterator
+    using M = std::flat_multimap<int, int, std::function<bool(int, int)>>;
+    using P = std::pair<int, int>;
+    P ar[]  = {{1, 1}, {2, 2}, {2, 4}, {5, 5}};
+    auto m  = M(std::sorted_equivalent,
+               cpp17_input_iterator<const P*>(ar),
+               cpp17_input_iterator<const P*>(ar + 4),
+               std::less<int>());
+    assert(m == M({{1, 1}, {2, 2}, {2, 4}, {5, 5}}, std::less<>()));
+    assert(m.key_comp()(1, 2) == true);
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent,
+            cpp17_input_iterator<const P*>(ar),
+            cpp17_input_iterator<const P*>(ar + 4),
+            std::less<int>()};
+    assert(m2 == m);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
+    // greater
+    using M = std::flat_multimap<int, int, std::greater<int>, std::deque<int, min_allocator<int>>, std::vector<int>>;
+    using P = std::pair<int, int>;
+    P ar[]  = {{5, 5}, {2, 4}, {2, 2}, {1, 1}};
+    auto m  = M(std::sorted_equivalent,
+               cpp17_input_iterator<const P*>(ar),
+               cpp17_input_iterator<const P*>(ar + 4),
+               std::greater<int>());
+    assert((m == M{{5, 5}, {2, 4}, {2, 2}, {1, 1}}));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
+    // contiguous iterator
+    using C = test_less<int>;
+    using M =
+        std::flat_multimap<int, int, C, std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>;
+    std::pair<int, int> ar[1] = {{42, 42}};
+    auto m                    = M(std::sorted_equivalent, ar, ar, C(5));
+    assert(m.empty());
+    assert(m.key_comp() == C(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator , InputIterator, const Allocator&)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, std::less<int>, std::vector<int, A1>, std::deque<int, A2>>;
+    using P  = std::pair<int, int>;
+    P ar[]        = {{2, 1}, {2, 2}, {4, 4}, {5, 5}};
+    auto m        = M(std::sorted_equivalent, ar, ar + 4, A1(5));
+    auto expected = M{{2, 1}, {2, 2}, {4, 4}, {5, 5}};
+    assert(m == expected);
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, ar, ar + 4, A1(5)};
+    assert(m2 == m);
+    assert(m2.keys().get_allocator() == A1(5));
+    assert(m2.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, C, std::vector<int, A1>, std::deque<int, A2>>;
+    using P  = std::pair<int, int>;
+    P ar[] = {{1, 1}, {1, 2}, {1, 4}, {1, 5}};
+    auto m = M(std::sorted_equivalent, ar, ar + 4, C(3), A1(5));
+    assert((m == M{{1, 1}, {1, 2}, {1, 4}, {1, 5}}));
+    assert(m.key_comp() == C(3));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
+    // explicit(false)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, std::less<int>, std::deque<int, A1>, std::vector<int, A2>>;
+    using P  = std::pair<int, int>;
+    P ar[] = {{1, 1}, {1, 2}, {1, 4}, {1, 5}};
+    M m    = {std::sorted_equivalent, ar, ar + 4, {}, A1(5)}; // implicit ctor
+    assert((m == M{{1, 1}, {1, 2}, {1, 4}, {1, 5}}));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if.pass.cpp
new file mode 100644
index 0000000000000..76d5cbd909050
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if.pass.cpp
@@ -0,0 +1,98 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// class flat_multimap
+
+// template <class Key, class T, class Compare, class KeyContainer, class MappedContainer, class Predicate>
+//   typename flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>::size_type
+//   erase_if(flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>& c, Predicate pred);
+
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <initializer_list>
+#include <vector>
+
+#include "test_macros.h"
+#include "test_allocator.h"
+#include "min_allocator.h"
+
+// Verify that `flat_multimap` (like `multimap`) does NOT support std::erase.
+//
+template <class S>
+concept HasStdErase = requires(S& s, typename S::value_type x) { std::erase(s, x); };
+static_assert(HasStdErase<std::vector<int>>);
+static_assert(!HasStdErase<std::flat_multimap<int, int>>);
+
+template <class M>
+M make(std::initializer_list<int> vals) {
+  M ret;
+  for (int v : vals) {
+    ret.emplace(static_cast<typename M::key_type>(v), static_cast<typename M::mapped_type>(v + 10));
+  }
+  return ret;
+}
+
+template <class M, class Pred>
+void test0(
+    std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
+  M s = make<M>(vals);
+  ASSERT_SAME_TYPE(typename M::size_type, decltype(std::erase_if(s, p)));
+  assert(expected_erased_count == std::erase_if(s, p));
+  assert(s == make<M>(expected));
+}
+
+template <class S>
+void test() {
+  // Test all the plausible signatures for this predicate.
+  auto is1   = [](typename S::const_reference v) { return v.first == 1; };
+  auto is2   = [](typename S::value_type v) { return v.first == 2; };
+  auto is3   = [](const typename S::value_type& v) { return v.first == 3; };
+  auto is4   = [](auto v) { return v.first == 4; };
+  auto True  = [](const auto&) { return true; };
+  auto False = [](auto&&) { return false; };
+
+  test0<S>({}, is1, {}, 0);
+
+  test0<S>({1}, is1, {}, 1);
+  test0<S>({1, 1}, is1, {}, 2);
+  test0<S>({1, 1}, is2, {1, 1}, 0);
+
+  test0<S>({1, 2}, is1, {2}, 1);
+  test0<S>({1, 2}, is2, {1}, 1);
+  test0<S>({1, 2, 2, 2}, is2, {1}, 3);
+  test0<S>({1, 2, 2, 2}, is3, {1, 2, 2, 2}, 0);
+
+  test0<S>({1, 1, 2, 2, 3, 3}, is1, {2, 2, 3, 3}, 2);
+  test0<S>({1, 1, 2, 2, 3, 3}, is2, {1, 1, 3, 3}, 2);
+  test0<S>({1, 1, 2, 2, 3, 3}, is3, {1, 1, 2, 2}, 2);
+  test0<S>({1, 1, 2, 2, 3, 3}, is4, {1, 1, 2, 2, 3, 3}, 0);
+
+  test0<S>({1, 2, 2, 3, 3, 3}, True, {}, 6);
+  test0<S>({1, 2, 2, 3, 3, 3}, False, {1, 2, 2, 3, 3, 3}, 0);
+}
+
+int main(int, char**) {
+  test<std::flat_multimap<int, char>>();
+  test<std::flat_multimap<int,
+                          char,
+                          std::less<int>,
+                          std::vector<int, min_allocator<int>>,
+                          std::vector<char, min_allocator<char>>>>();
+  test<std::flat_multimap<int, char, std::greater<int>, std::vector<int, test_allocator<int>>>>();
+  test<std::flat_multimap<int, char, std::less<int>, std::deque<int, min_allocator<int>>>>();
+  test<std::flat_multimap<int, char, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+  test<std::flat_multimap<long, int>>();
+  test<std::flat_multimap<double, int>>();
+
+  return 0;
+}
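As the HasStdErase assertions above document, std::erase_if is the only erasure helper provided for flat_multimap; plain std::erase is deliberately absent, matching std::multimap. A short usage sketch:

    #include <cassert>
    #include <flat_map>

    int main() {
      std::flat_multimap<int, char> m = {{1, 'a'}, {2, 'b'}, {2, 'c'}, {3, 'd'}};
      auto n = std::erase_if(m, [](const auto& kv) { return kv.first == 2; });
      assert(n == 2 && m.size() == 2); // both entries with key 2 removed
    }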
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if_exceptions.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if_exceptions.pass.cpp
new file mode 100644
index 0000000000000..13b57202f7862
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if_exceptions.pass.cpp
@@ -0,0 +1,157 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: no-exceptions
+
+// <flat_map>
+
+// class flat_multimap
+
+// template <class Key, class T, class Compare, class KeyContainer, class MappedContainer, class Predicate>
+//   typename flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>::size_type
+//   erase_if(flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>& c, Predicate pred);
+// If any member function in [flat.multimap.defn] exits via an exception, the invariant is restored.
+// (This is not a member function, but let's respect the invariant anyway.)
+
+#include <algorithm>
+#include <cassert>
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <utility>
+#include <vector>
+
+#include "../helpers.h"
+#include "test_macros.h"
+
+struct Counter {
+  int c1, c2, throws;
+  void tick() {
+    c1 -= 1;
+    if (c1 == 0) {
+      c1 = c2;
+      throws += 1;
+      throw 42;
+    }
+  }
+};
+Counter g_counter = {0, 0, 0};
+
+struct ThrowingAssignment {
+  ThrowingAssignment(int i) : i_(i) {}
+  ThrowingAssignment(const ThrowingAssignment&) = default;
+  ThrowingAssignment& operator=(const ThrowingAssignment& rhs) {
+    g_counter.tick();
+    i_ = rhs.i_;
+    g_counter.tick();
+    return *this;
+  }
+  operator int() const { return i_; }
+  int i_;
+};
+
+struct ThrowingComparator {
+  bool operator()(const ThrowingAssignment& a, const ThrowingAssignment& b) const {
+    g_counter.tick();
+    return a.i_ < b.i_;
+  }
+};
+
+struct ErasurePredicate {
+  bool operator()(const auto& x) const { return (3 <= x.first && x.first <= 5); }
+};
+
+int main(int, char**) {
+  const std::pair<int, int> expected[] = {{1, 1}, {2, 2}, {3, 3}, {3, 3}, {5, 5}, {6, 6}, {7, 7}, {8, 8}};
+  {
+    using M = std::flat_multimap<ThrowingAssignment, int, ThrowingComparator>;
+    for (int first_throw = 1; first_throw < 99; ++first_throw) {
+      for (int second_throw = 1; second_throw < 99; ++second_throw) {
+        g_counter = {0, 0, 0};
+        M m       = M({1, 2, 3, 3, 5, 6, 7, 8}, {1, 2, 3, 3, 5, 6, 7, 8});
+        try {
+          g_counter = {first_throw, second_throw, 0};
+          auto n    = std::erase_if(m, ErasurePredicate());
+          assert(n == 3);
+          // If it didn't throw at all, we're done.
+          g_counter = {0, 0, 0};
+          assert((m == M{{1, 1}, {2, 2}, {6, 6}, {7, 7}, {8, 8}}));
+          first_throw = 99; // "done"
+          break;
+        } catch (int ex) {
+          assert(ex == 42);
+          check_invariant(m);
+          LIBCPP_ASSERT(m.empty() || std::equal(m.begin(), m.end(), expected, expected + 8));
+          if (g_counter.throws == 1) {
+            // We reached the first throw but not the second throw.
+            break;
+          }
+        }
+      }
+    }
+  }
+  {
+    using M = std::flat_multimap<int, ThrowingAssignment, ThrowingComparator>;
+    for (int first_throw = 1; first_throw < 99; ++first_throw) {
+      for (int second_throw = 1; second_throw < 99; ++second_throw) {
+        g_counter = {0, 0, 0};
+        M m       = M({1, 2, 3, 3, 5, 6, 7, 8}, {1, 2, 3, 3, 5, 6, 7, 8});
+        try {
+          g_counter = {first_throw, second_throw, 0};
+          auto n    = std::erase_if(m, ErasurePredicate());
+          assert(n == 3);
+          // If it didn't throw at all, we're done.
+          g_counter = {0, 0, 0};
+          assert((m == M{{1, 1}, {2, 2}, {6, 6}, {7, 7}, {8, 8}}));
+          first_throw = 99; // "done"
+          break;
+        } catch (int ex) {
+          assert(ex == 42);
+          check_invariant(m);
+          LIBCPP_ASSERT(m.empty() || std::equal(m.begin(), m.end(), expected, expected + 8));
+          if (g_counter.throws == 1) {
+            // We reached the first throw but not the second throw.
+            break;
+          }
+        }
+      }
+    }
+  }
+  {
+    using M = std::
+        flat_multimap<ThrowingAssignment, int, ThrowingComparator, std::deque<ThrowingAssignment>, std::deque<int>>;
+    for (int first_throw = 1; first_throw < 99; ++first_throw) {
+      for (int second_throw = 1; second_throw < 99; ++second_throw) {
+        g_counter = {0, 0, 0};
+        std::deque<ThrowingAssignment> container = {5, 6, 7, 8};
+        container.insert(container.begin(), {1, 2, 3, 3});
+        M m = M(std::move(container), {1, 2, 3, 3, 5, 6, 7, 8});
+        try {
+          g_counter = {first_throw, second_throw, 0};
+          auto n    = std::erase_if(m, ErasurePredicate());
+          assert(n == 3);
+          // If it didn't throw at all, we're done.
+          g_counter = {0, 0, 0};
+          assert((m == M{{1, 1}, {2, 2}, {6, 6}, {7, 7}, {8, 8}}));
+          first_throw = 99; // "done"
+          break;
+        } catch (int ex) {
+          assert(ex == 42);
+          check_invariant(m);
+          LIBCPP_ASSERT(m.empty() || std::equal(m.begin(), m.end(), expected, expected + 8));
+          if (g_counter.throws == 1) {
+            // We reached the first throw but not the second throw.
+            break;
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator.pass.cpp
new file mode 100644
index 0000000000000..c1285955e5db6
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator.pass.cpp
@@ -0,0 +1,105 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// class flat_multimap
+
+//       iterator begin()   noexcept;
+// const_iterator begin()   const noexcept
+//       iterator end()     noexcept;
+// const_iterator end()     const noexcept;
+//
+// const_iterator cbegin()  const noexcept;
+// const_iterator cend()    const noexcept;
+
+#include <cassert>
+#include <cstddef>
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <vector>
+
+#include "MinSequenceContainer.h"
+#include "test_macros.h"
+#include "min_allocator.h"
+
+template <class KeyContainer, class ValueContainer>
+void test() {
+  using Key   = typename KeyContainer::value_type;
+  using Value = typename ValueContainer::value_type;
+  using M     = std::flat_multimap<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
+
+  M m         = {{1, 'a'}, {1, 'z'}, {2, 'b'}, {3, 'a'}, {3, 'b'}, {3, 'c'}, {4, 'd'}};
+  const M& cm = m;
+  ASSERT_SAME_TYPE(decltype(m.begin()), typename M::iterator);
+  ASSERT_SAME_TYPE(decltype(m.cbegin()), typename M::const_iterator);
+  ASSERT_SAME_TYPE(decltype(cm.begin()), typename M::const_iterator);
+  ASSERT_SAME_TYPE(decltype(m.end()), typename M::iterator);
+  ASSERT_SAME_TYPE(decltype(m.cend()), typename M::const_iterator);
+  ASSERT_SAME_TYPE(decltype(cm.end()), typename M::const_iterator);
+  static_assert(noexcept(m.begin()));
+  static_assert(noexcept(cm.begin()));
+  static_assert(noexcept(m.cbegin()));
+  static_assert(noexcept(m.end()));
+  static_assert(noexcept(cm.end()));
+  static_assert(noexcept(m.cend()));
+  assert(m.size() == 7);
+  assert(std::distance(m.begin(), m.end()) == 7);
+  assert(std::distance(cm.begin(), cm.end()) == 7);
+  assert(std::distance(m.cbegin(), m.cend()) == 7);
+  typename M::iterator i;                   // default-construct
+  i                            = m.begin(); // move-assignment
+  typename M::const_iterator k = i;         // converting constructor
+  assert(i == k);                           // comparison
+  assert(i->first == 1);                    // operator->
+  assert(i->second == 'a');                 // operator->
+  ++i;                                      // pre-increment
+  assert(i->first == 1);                    // operator->
+  assert(i->second == 'z');                 // operator->
+  i = i + 3;                                // operator+
+  assert((*i).first == 3);                  // operator*
+  assert((*i).second == 'b');               // operator*
+  i += 3;                                   // operator+=
+  assert(i == m.end());                     // operator==
+  --i;                                      // pre-decrement
+  assert(i->first == 4);                    // operator->
+  assert(i->second == 'd');                 // operator->
+  i = i - 2;                                // operator-
+  assert(i->first == 3);                    // operator->
+  assert(i->second == 'b');                 // operator->
+  i -= 2;                                   // operator-=
+  assert(i > m.begin());                    // operator>
+}
+
+int main(int, char**) {
+  test<std::vector<int>, std::vector<char>>();
+  test<std::deque<int>, std::vector<char>>();
+  test<MinSequenceContainer<int>, MinSequenceContainer<char>>();
+  test<std::vector<int, min_allocator<int>>, std::vector<char, min_allocator<char>>>();
+
+  {
+    // N3644 testing
+    using C = std::flat_multimap<int, char>;
+    C::iterator ii1{}, ii2{};
+    C::iterator ii4 = ii1;
+    C::const_iterator cii{};
+    assert(ii1 == ii2);
+    assert(ii1 == ii4);
+    assert(!(ii1 != ii2));
ii2)); + + assert((ii1 == cii)); + assert((cii == ii1)); + assert(!(ii1 != cii)); + assert(!(cii != ii1)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_comparison.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_comparison.pass.cpp new file mode 100644 index 0000000000000..f1b2cad743e23 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_comparison.pass.cpp @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap iterators should be C++20 random access iterators + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using KI = typename KeyContainer::iterator; + using I = M::iterator; + using CI = M::const_iterator; + using RI = M::reverse_iterator; + using CRI = M::const_reverse_iterator; + + static_assert(std::equality_comparable); + static_assert(std::equality_comparable); + static_assert(std::equality_comparable); + static_assert(std::equality_comparable); + + static_assert(std::totally_ordered); + static_assert(std::totally_ordered); + static_assert(std::totally_ordered); + static_assert(std::totally_ordered); + + M m = {{1, 'a'}, {2, 'b'}, {2, 'e'}, {3, 'z'}, {3, 'y'}, {3, 'c'}, {4, 'd'}}; + + I i1 = m.begin(); + I i2 = m.begin() + 1; + + assert(i1 == i1); + assert(!(i1 != i1)); + assert(i1 != i2); + assert(!(i1 == i2)); + assert(i1 < i2); + assert(!(i1 < i1)); + assert(i1 <= i1); + assert(i1 <= i2); + assert(!(i2 <= i1)); + assert(i2 > i1); + assert(!(i2 > i2)); + assert(i2 >= i1); + assert(i2 >= i2); + assert(!(i1 >= i2)); + + CI ci1 = m.cbegin(); + CI ci2 = m.cbegin() + 1; + assert(ci1 == ci1); + assert(!(ci1 != ci1)); + assert(ci1 != ci2); + assert(!(ci1 == ci2)); + assert(ci1 < ci2); + assert(!(ci1 < ci1)); + assert(ci1 <= ci1); + assert(ci1 <= ci2); + assert(!(ci2 <= ci1)); + assert(ci2 > ci1); + assert(!(ci2 > ci2)); + assert(ci2 >= ci1); + assert(ci2 >= ci2); + assert(!(ci1 >= ci2)); + + RI ri1 = m.rbegin(); + RI ri2 = m.rbegin() + 1; + assert(ri1 == ri1); + assert(!(ri1 != ri1)); + assert(ri1 != ri2); + assert(!(ri1 == ri2)); + assert(ri1 < ri2); + assert(!(ri1 < ri1)); + assert(ri1 <= ri1); + assert(ri1 <= ri2); + assert(!(ri2 <= ri1)); + assert(ri2 > ri1); + assert(!(ri2 > ri2)); + assert(ri2 >= ri1); + assert(ri2 >= ri2); + assert(!(ri1 >= ri2)); + + CRI cri1 = m.crbegin(); + CRI cri2 = m.crbegin() + 1; + assert(cri1 == cri1); + assert(!(cri1 != cri1)); + assert(cri1 != cri2); + assert(!(cri1 == cri2)); + assert(cri1 < cri2); + assert(!(cri1 < cri1)); + assert(cri1 <= cri1); + assert(cri1 <= cri2); + assert(!(cri2 <= cri1)); + assert(cri2 > cri1); + assert(!(cri2 > cri2)); + assert(cri2 >= cri1); + assert(cri2 >= cri2); + assert(!(cri1 >= cri2)); + + if constexpr 
(std::three_way_comparable) { + static_assert(std::three_way_comparable); // ...of course the wrapped iterators still support <=>. + static_assert(std::three_way_comparable); + static_assert(std::three_way_comparable); + static_assert(std::three_way_comparable); + static_assert(std::same_as I()), std::strong_ordering>); + static_assert(std::same_as CI()), std::strong_ordering>); + static_assert(std::same_as CI()), std::strong_ordering>); + static_assert(std::same_as RI()), std::strong_ordering>); + static_assert(std::same_as CRI()), std::strong_ordering>); + static_assert(std::same_as CRI()), std::strong_ordering>); + + assert(i1 <=> i1 == std::strong_ordering::equivalent); + assert(i1 <=> i2 == std::strong_ordering::less); + assert(i2 <=> i1 == std::strong_ordering::greater); + + assert(ci1 <=> ci1 == std::strong_ordering::equivalent); + assert(ci1 <=> ci2 == std::strong_ordering::less); + assert(ci2 <=> ci1 == std::strong_ordering::greater); + + assert(ri1 <=> ri1 == std::strong_ordering::equivalent); + assert(ri1 <=> ri2 == std::strong_ordering::less); + assert(ri2 <=> ri1 == std::strong_ordering::greater); + + assert(cri1 <=> cri1 == std::strong_ordering::equivalent); + assert(cri1 <=> cri2 == std::strong_ordering::less); + assert(cri2 <=> cri1 == std::strong_ordering::greater); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_concept_conformance.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_concept_conformance.compile.pass.cpp new file mode 100644 index 0000000000000..ce578e4def92b --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_concept_conformance.compile.pass.cpp @@ -0,0 +1,84 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator, const_iterator, reverse_iterator, const_reverse_iterator + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using C = std::flat_multimap, KeyContainer, ValueContainer>; + using I = C::iterator; + using CI = C::const_iterator; + using RI = C::reverse_iterator; + using CRI = C::const_reverse_iterator; + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(!std::contiguous_iterator); + static_assert(!std::contiguous_iterator); + static_assert(!std::contiguous_iterator); + static_assert(!std::contiguous_iterator); + static_assert(!std::indirectly_writable>); + static_assert(!std::indirectly_writable>); + static_assert(!std::indirectly_writable>); + static_assert(!std::indirectly_writable>); + static_assert(std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(std::indirectly_movable_storable*>); + static_assert(std::indirectly_movable_storable*>); + static_assert(std::indirectly_movable_storable*>); + static_assert(std::indirectly_movable_storable*>); + +#ifdef _LIBCPP_VERSION + static_assert(std::is_same_v::iterator_category, std::random_access_iterator_tag>); + static_assert(std::is_same_v::iterator_category, std::random_access_iterator_tag>); + static_assert(std::is_same_v::iterator_category, std::random_access_iterator_tag>); + static_assert(std::is_same_v::iterator_category, std::random_access_iterator_tag>); +#endif +} + +void test() { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/range_concept_conformance.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/range_concept_conformance.compile.pass.cpp new file mode 100644 index 0000000000000..979c0b090fd66 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/range_concept_conformance.compile.pass.cpp @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include +#include +#include +#include +#include +#include +#include +#include "MinSequenceContainer.h" +#include "min_allocator.h" + +template +void test() { + { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using C = std::flat_multimap, KeyContainer, ValueContainer>; + + static_assert(std::same_as, typename C::iterator>); + static_assert(std::ranges::random_access_range); + static_assert(!std::ranges::contiguous_range); + static_assert(std::ranges::common_range); + static_assert(std::ranges::input_range); + static_assert(!std::ranges::view); + static_assert(std::ranges::sized_range); + static_assert(!std::ranges::borrowed_range); + static_assert(std::ranges::viewable_range); + + static_assert(std::same_as, typename C::const_iterator>); + static_assert(std::ranges::random_access_range); + static_assert(!std::ranges::contiguous_range); + static_assert(std::ranges::common_range); + static_assert(std::ranges::input_range); + static_assert(!std::ranges::view); + static_assert(std::ranges::sized_range); + static_assert(!std::ranges::borrowed_range); + static_assert(!std::ranges::viewable_range); + } +} + +void test() { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/reverse_iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/reverse_iterator.pass.cpp new file mode 100644 index 0000000000000..8c1e5451f703f --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/reverse_iterator.pass.cpp @@ -0,0 +1,92 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// reverse_iterator rbegin() noexcept; +// const_reverse_iterator rbegin() const noexcept; +// reverse_iterator rend() noexcept; +// const_reverse_iterator rend() const noexcept; +// +// const_reverse_iterator crbegin() const noexcept; +// const_reverse_iterator crend() const noexcept; + +#include +#include +#include +#include +#include +#include + +#include + +#include "test_macros.h" +#include + +int main(int, char**) { + { + using M = std::flat_multimap, std::deque, std::deque>; + M m = {{1, 'a'}, {1, 'b'}, {2, 'c'}, {2, 'd'}, {3, 'e'}, {3, 'f'}, {4, 'g'}, {4, 'h'}}; + const M& cm = m; + ASSERT_SAME_TYPE(decltype(m.rbegin()), M::reverse_iterator); + ASSERT_SAME_TYPE(decltype(m.crbegin()), M::const_reverse_iterator); + ASSERT_SAME_TYPE(decltype(cm.rbegin()), M::const_reverse_iterator); + ASSERT_SAME_TYPE(decltype(m.rend()), M::reverse_iterator); + ASSERT_SAME_TYPE(decltype(m.crend()), M::const_reverse_iterator); + ASSERT_SAME_TYPE(decltype(cm.rend()), M::const_reverse_iterator); + static_assert(noexcept(m.rbegin())); + static_assert(noexcept(cm.rbegin())); + static_assert(noexcept(m.crbegin())); + static_assert(noexcept(m.rend())); + static_assert(noexcept(cm.rend())); + static_assert(noexcept(m.crend())); + assert(m.size() == 8); + assert(std::distance(m.rbegin(), m.rend()) == 8); + assert(std::distance(cm.rbegin(), cm.rend()) == 8); + assert(std::distance(m.crbegin(), m.crend()) == 8); + assert(std::distance(cm.crbegin(), cm.crend()) == 8); + M::reverse_iterator i; // default-construct + ASSERT_SAME_TYPE(decltype(i->first), const int&); + ASSERT_SAME_TYPE(decltype(i->second), char&); + i = m.rbegin(); // move-assignment + M::const_reverse_iterator k = i; // converting constructor + assert(i == k); // comparison + for (int j = 8; j >= 1; --j, ++i) { // pre-increment + assert(i->first == (j + 1) / 2); // operator-> + assert(i->second == 'a' + j - 1); + } + assert(i == m.rend()); + for (int j = 1; j <= 8; ++j) { + --i; // pre-decrement + assert((*i).first == (j + 1) / 2); + assert((*i).second == 'a' + j - 1); + } + assert(i == m.rbegin()); + } + { + // N3644 testing + using C = std::flat_multimap; + C::reverse_iterator ii1{}, ii2{}; + C::reverse_iterator ii4 = ii1; + C::const_reverse_iterator cii{}; + assert(ii1 == ii2); + assert(ii1 == ii4); + assert(!(ii1 != ii2)); + + assert((ii1 == cii)); + assert((cii == ii1)); + assert(!(ii1 != cii)); + assert(!(cii != ii1)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/clear.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/clear.pass.cpp new file mode 100644 index 0000000000000..5b0788b6826fd --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/clear.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// void clear() noexcept; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// test noexcept + +template +concept NoExceptClear = requires(T t) { + { t.clear() } noexcept; +}; + +static_assert(NoExceptClear>); +#ifndef TEST_HAS_NO_EXCEPTIONS +static_assert( + NoExceptClear, ThrowOnMoveContainer, ThrowOnMoveContainer>>); +#endif + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + M m = {{5, 2}, {2, 1}, {2, 3}, {2, 1}, {5, 0}}; + assert(m.size() == 5); + ASSERT_NOEXCEPT(m.clear()); + ASSERT_SAME_TYPE(decltype(m.clear()), void); + m.clear(); + assert(m.size() == 0); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace.pass.cpp new file mode 100644 index 0000000000000..9ef0c26e54ba3 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace.pass.cpp @@ -0,0 +1,158 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template +// iterator emplace(Args&&... args); + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "../../../Emplaceable.h" +#include "DefaultOnly.h" +#include "min_allocator.h" + +// Constraints: is_constructible_v, Args...> is true. +template +concept CanEmplace = requires(M m, Args&&... 
args) { m.emplace(std::forward(args)...); }; + +using Map = std::flat_multimap; +static_assert(CanEmplace); +static_assert(CanEmplace); +static_assert(CanEmplace, std::tuple>); +static_assert(!CanEmplace); +static_assert(!CanEmplace); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = typename M::iterator; + + { + // was empty + M m; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 3.5)); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == 3.5); + } + { + // key does not exist and inserted at the begin + M m = {{3, 4.0}, {3, 3.0}, {3, 1.0}, {7, 0.0}}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 2.0)); + assert(r == m.begin()); + assert(m.size() == 5); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // key does not exist and inserted in the middle + M m = {{1, 4.0}, {1, 3.0}, {3, 1.0}, {4, 0.0}}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // key does not exist and inserted at the end + M m = {{1, 4.0}, {1, 3.0}}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 3); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // key already exists and original at the begin + M m = {{2, 4.0}, {2, 3.0}, {5, 1.0}, {6, 0.0}}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // key already exists and original in the middle + M m = {{0, 4.0}, {2, 3.0}, {2, 1.0}, {4, 0.0}}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 2.0)); + assert(r == m.begin() + 3); + assert(m.size() == 5); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // key already exists and original at the end + M m = {{0, 4.0}, {1, 3.0}, {2, 1.0}}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 2.0)); + assert(r == m.begin() + 3); + assert(m.size() == 4); + assert(r->first == 2); + assert(r->second == 2.0); + } +} + +template +void test_emplaceable() { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = typename M::iterator; + + M m; + std::same_as decltype(auto) r = + m.emplace(std::piecewise_construct, std::forward_as_tuple(2), std::forward_as_tuple()); + assert(r == m.begin()); + assert(m.size() == 1); + assert(m.begin()->first == 2); + assert(m.begin()->second == Emplaceable()); + r = m.emplace(std::piecewise_construct, std::forward_as_tuple(1), std::forward_as_tuple(2, 3.5)); + assert(r == m.begin()); + assert(m.size() == 2); + assert(m.begin()->first == 1); + assert(m.begin()->second == Emplaceable(2, 3.5)); + r = m.emplace(std::piecewise_construct, std::forward_as_tuple(1), std::forward_as_tuple(2, 3.5)); + assert(r == m.begin() + 1); + assert(m.size() == 3); + assert(m.begin()->first == 1); + assert(m.begin()->second == Emplaceable(2, 3.5)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + test_emplaceable, std::vector>(); + test_emplaceable, std::vector>(); + test_emplaceable, MinSequenceContainer>(); + test_emplaceable>, 
std::vector>>(); + + { + auto emplace_func = [](auto& m, auto key_arg, auto value_arg) { + m.emplace(std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg)); + }; + test_emplace_exception_guarantee(emplace_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace_hint.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace_hint.pass.cpp new file mode 100644 index 0000000000000..588d27ea54f4d --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace_hint.pass.cpp @@ -0,0 +1,228 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template +// iterator emplace_hint(const_iterator position, Args&&... args); + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "../../../Emplaceable.h" +#include "DefaultOnly.h" +#include "min_allocator.h" +#include "../helpers.h" + +#if defined(_LIBCPP_VERSION) +// spec only specifies `emplace(Args&&...)` is_constructible_v, Args...> is true. +// nothing mentioned for emplace_hint +template +concept CanEmplaceHint = + requires(M m, typename M::const_iterator i, Args&&... args) { m.emplace_hint(i, std::forward(args)...); }; + +using Map = std::flat_multimap; +static_assert(CanEmplaceHint); +static_assert(CanEmplaceHint); +static_assert(CanEmplaceHint, std::tuple>); +static_assert(!CanEmplaceHint); +static_assert(!CanEmplaceHint); +#endif + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = M::iterator; + { + // was empty + M m; + std::same_as decltype(auto) r = m.emplace_hint(m.end(), typename M::value_type(2, 3.5)); + assert(r == m.begin()); + assert(m.size() == 1); + assert(m.begin()->first == 2); + assert(m.begin()->second == 3.5); + } + { + // hint correct and no duplicates + M m = {{0, 0.0}, {1, 1.0}, {3, 3.0}}; + auto it = m.begin() + 2; + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 4); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // hint correct and at the begin + M m = {{3, 3.0}, {4, 4.0}}; + auto it = m.begin(); + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin()); + assert(m.size() == 3); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // hint correct and at the end + M m = {{0, 0.0}, {1, 1.0}}; + auto it = m.end(); + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 3); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // hint correct and at first duplicate + M m = {{0, 0.0}, {1, 1.0}, {2, 1.9}, {2, 2.1}, {3, 3.0}}; + auto it = m.begin() + 2; + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + 
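+ // With an exact hint, the new (2, 2.0) lands at the hint position, i.e. before
+ // the pre-existing (2, 1.9); the asserts below check position and duplicate order.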
assert(m.size() == 6); + assert(r->first == 2); + assert(r->second == 2.0); + assert(std::next(r)->first == 2); + assert(std::next(r)->second == 1.9); + } + { + // hint correct and in-between duplicates + M m = {{0, 0.0}, {1, 1.0}, {2, 1.8}, {2, 1.9}, {2, 2.1}, {3, 3.0}}; + auto it = m.begin() + 4; + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 4); + assert(m.size() == 7); + assert(r->first == 2); + assert(r->second == 2.0); + assert(std::next(r)->first == 2); + assert(std::next(r)->second == 2.1); + } + { + // hint correct and after duplicates + M m = {{0, 0.0}, {1, 1.0}, {2, 1.8}, {2, 1.9}, {2, 2.1}, {3, 3.0}}; + auto it = m.begin() + 5; + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 5); + assert(m.size() == 7); + assert(r->first == 2); + assert(r->second == 2.0); + assert(std::next(r)->first == 3); + assert(std::next(r)->second == 3.0); + } + { + // hint incorrect and no duplicates + M m = {{0, 0.0}, {1, 1.0}, {3, 3.0}}; + auto it = m.begin() + 1; + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 4); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // hint incorrect and at the begin + M m = {{0, 0.0}, {1, 1.0}}; + auto it = m.begin(); + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 3); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // hint incorrect and at the end + M m = {{3, 3.0}, {4, 4.0}}; + auto it = m.end(); + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin()); + assert(m.size() == 3); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // hint incorrect and before the first duplicate + M m = {{0, 0.0}, {1, 1.0}, {2, 1.8}, {2, 1.9}, {2, 2.1}, {3, 3.0}}; + auto it = m.begin(); + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + // the result is as left as possible + assert(r == m.begin() + 2); + assert(m.size() == 7); + assert(r->first == 2); + assert(r->second == 2.0); + assert(std::next(r)->first == 2); + assert(std::next(r)->second == 1.8); + } + { + // hint incorrect and after the last duplicate + M m = {{0, 0.0}, {1, 1.0}, {2, 1.8}, {2, 1.9}, {2, 2.1}, {3, 3.0}, {4, 4.0}}; + auto it = m.begin() + 6; + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + // the result is as right as possible + assert(r == m.begin() + 5); + assert(m.size() == 8); + assert(r->first == 2); + assert(r->second == 2.0); + assert(std::next(r)->first == 3); + assert(std::next(r)->second == 3.0); + } +} + +template +void test_emplaceable() { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = M::iterator; + + M m; + ASSERT_SAME_TYPE(decltype(m.emplace_hint(m.cbegin())), R); + R r = m.emplace_hint(m.end(), std::piecewise_construct, std::forward_as_tuple(2), std::forward_as_tuple()); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == Emplaceable()); + r = m.emplace_hint(m.end(), std::piecewise_construct, std::forward_as_tuple(1), std::forward_as_tuple(2, 3.5)); + assert(r == m.begin()); + assert(m.size() == 2); + assert(r->first == 1); + assert(r->second == Emplaceable(2, 3.5)); + r = m.emplace_hint(m.end(), std::piecewise_construct, std::forward_as_tuple(1), 
std::forward_as_tuple(2, 3.6)); + assert(r == m.begin() + 1); + assert(m.size() == 3); + assert(r->first == 1); + assert(r->second == Emplaceable(2, 3.6)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + test_emplaceable, std::vector>(); + test_emplaceable, std::vector>(); + test_emplaceable, MinSequenceContainer>(); + test_emplaceable>, std::vector>>(); + + { + auto emplace_func = [](auto& m, auto key_arg, auto value_arg) { + m.emplace_hint(m.begin(), std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg)); + }; + test_emplace_exception_guarantee(emplace_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter.pass.cpp new file mode 100644 index 0000000000000..78040be2e043d --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter.pass.cpp @@ -0,0 +1,127 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator erase(iterator position); +// iterator erase(const_iterator position); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using P = std::pair; + using I = M::iterator; + + P ar[] = { + P(1, 1.5), + P(2, 2.5), + P(2, 2.6), + P(3, 3.5), + P(4, 4.5), + P(4, 4.5), + P(4, 4.7), + P(5, 5.5), + P(6, 6.5), + P(7, 7.5), + P(8, 8.5), + }; + M m(ar, ar + sizeof(ar) / sizeof(ar[0])); + assert(m.size() == 11); + std::same_as decltype(auto) i1 = m.erase(std::next(m.cbegin(), 2)); + assert(m.size() == 10); + assert(i1 == std::next(m.begin(), 2)); + assert(std::ranges::equal( + m, + std::vector
<P>
    { + {1, 1.5}, {2, 2.5}, {3, 3.5}, {4, 4.5}, {4, 4.5}, {4, 4.7}, {5, 5.5}, {6, 6.5}, {7, 7.5}, {8, 8.5}})); + + std::same_as decltype(auto) i2 = m.erase(std::next(m.begin(), 0)); + assert(m.size() == 9); + assert(i2 == m.begin()); + assert(std::ranges::equal( + m, std::vector
<P>
    {{2, 2.5}, {3, 3.5}, {4, 4.5}, {4, 4.5}, {4, 4.7}, {5, 5.5}, {6, 6.5}, {7, 7.5}, {8, 8.5}})); + + std::same_as decltype(auto) i3 = m.erase(std::next(m.cbegin(), 8)); + assert(m.size() == 8); + assert(i3 == m.end()); + assert(std::ranges::equal( + m, std::vector
<P>
    {{2, 2.5}, {3, 3.5}, {4, 4.5}, {4, 4.5}, {4, 4.7}, {5, 5.5}, {6, 6.5}, {7, 7.5}})); + + std::same_as decltype(auto) i4 = m.erase(std::next(m.begin(), 1)); + assert(m.size() == 7); + assert(i4 == std::next(m.begin())); + assert(std::ranges::equal(m, std::vector
<P>
    {{2, 2.5}, {4, 4.5}, {4, 4.5}, {4, 4.7}, {5, 5.5}, {6, 6.5}, {7, 7.5}})); + + std::same_as decltype(auto) i5 = m.erase(std::next(m.cbegin(), 2)); + assert(m.size() == 6); + assert(i5 == std::next(m.begin(), 2)); + assert(std::ranges::equal(m, std::vector
<P>
    {{2, 2.5}, {4, 4.5}, {4, 4.7}, {5, 5.5}, {6, 6.5}, {7, 7.5}})); + + std::same_as decltype(auto) i6 = m.erase(std::next(m.begin(), 2)); + assert(m.size() == 5); + assert(i6 == std::next(m.begin(), 2)); + assert(std::ranges::equal(m, std::vector
<P>
    {{2, 2.5}, {4, 4.5}, {5, 5.5}, {6, 6.5}, {7, 7.5}})); + + std::same_as decltype(auto) i7 = m.erase(std::next(m.cbegin(), 0)); + assert(m.size() == 4); + assert(i7 == std::next(m.begin(), 0)); + assert(std::ranges::equal(m, std::vector
<P>
    {{4, 4.5}, {5, 5.5}, {6, 6.5}, {7, 7.5}})); + + std::same_as decltype(auto) i8 = m.erase(std::next(m.cbegin(), 2)); + assert(m.size() == 3); + assert(i8 == std::next(m.begin(), 2)); + assert(std::ranges::equal(m, std::vector
<P>
    {{4, 4.5}, {5, 5.5}, {7, 7.5}})); + + std::same_as decltype(auto) i9 = m.erase(std::next(m.cbegin(), 2)); + assert(m.size() == 2); + assert(i9 == std::next(m.begin(), 2)); + assert(std::ranges::equal(m, std::vector
<P>
    {{4, 4.5}, {5, 5.5}})); + + std::same_as decltype(auto) i10 = m.erase(m.cbegin()); + assert(m.size() == 1); + assert(i10 == m.cbegin()); + assert(std::ranges::equal(m, std::vector
<P>
    {{5, 5.5}})); + + std::same_as decltype(auto) i11 = m.erase(m.begin()); + assert(m.size() == 0); + assert(i11 == m.begin()); + assert(i11 == m.end()); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto erase_function = [](auto& m, auto) { m.erase(m.begin() + 2); }; + test_erase_exception_guarantee(erase_function); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter_iter.pass.cpp new file mode 100644 index 0000000000000..103f38c1c5d4a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter_iter.pass.cpp @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator erase(const_iterator first, const_iterator last); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using P = std::pair; + using I = M::iterator; + + P ar[] = { + P(1, 1.5), + P(2, 2.5), + P(2, 2.6), + P(3, 3.5), + P(3, 3.6), + P(3, 3.7), + P(4, 4.5), + P(5, 5.5), + P(6, 6.5), + P(7, 7.5), + P(8, 8.5), + }; + M m(ar, ar + sizeof(ar) / sizeof(ar[0])); + assert(m.size() == 11); + std::same_as decltype(auto) i1 = m.erase(m.cbegin(), m.cbegin()); + assert(m.size() == 11); + assert(i1 == m.begin()); + assert(std::ranges::equal( + m, + std::vector
<P>
    { + {1, 1.5}, + {2, 2.5}, + {2, 2.6}, + {3, 3.5}, + {3, 3.6}, + {3, 3.7}, + {4, 4.5}, + {5, 5.5}, + {6, 6.5}, + {7, 7.5}, + {8, 8.5}})); + + std::same_as decltype(auto) i2 = m.erase(m.cbegin(), std::next(m.cbegin(), 2)); + assert(m.size() == 9); + assert(i2 == m.begin()); + assert(std::ranges::equal( + m, std::vector
<P>
    {{2, 2.6}, {3, 3.5}, {3, 3.6}, {3, 3.7}, {4, 4.5}, {5, 5.5}, {6, 6.5}, {7, 7.5}, {8, 8.5}})); + + std::same_as decltype(auto) i3 = m.erase(std::next(m.cbegin(), 2), std::next(m.cbegin(), 6)); + assert(m.size() == 5); + assert(i3 == std::next(m.begin(), 2)); + assert(std::ranges::equal(m, std::vector
<P>
    {{2, 2.6}, {3, 3.5}, {6, 6.5}, {7, 7.5}, {8, 8.5}})); + + std::same_as decltype(auto) i4 = m.erase(m.cbegin(), m.cend()); + assert(m.size() == 0); + assert(i4 == m.begin()); + assert(i4 == m.end()); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto erase_function = [](auto& m, auto) { m.erase(m.begin(), m.begin() + 2); }; + test_erase_exception_guarantee(erase_function); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key.pass.cpp new file mode 100644 index 0000000000000..7944996fba1a0 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key.pass.cpp @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// size_type erase(const key_type& k); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template > +void test() { + using M = std::flat_multimap; + + auto make = [](std::initializer_list il) { + M m; + for (int i : il) { + m.emplace(i, i); + } + return m; + }; + M m = make({1, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, 8, 8, 9}); + ASSERT_SAME_TYPE(decltype(m.erase(9)), typename M::size_type); + auto n = m.erase(10); + assert(n == 0); + assert(m == make({1, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, 8, 8, 9})); + n = m.erase(4); + assert(n == 1); + assert(m == make({1, 1, 2, 2, 2, 3, 5, 5, 6, 7, 8, 8, 8, 8, 9})); + n = m.erase(1); + assert(n == 2); + assert(m == make({2, 2, 2, 3, 5, 5, 6, 7, 8, 8, 8, 8, 9})); + n = m.erase(8); + assert(n == 4); + assert(m == make({2, 2, 2, 3, 5, 5, 6, 7, 9})); + n = m.erase(3); + assert(n == 1); + assert(m == make({2, 2, 2, 5, 5, 6, 7, 9})); + n = m.erase(4); + assert(n == 0); + assert(m == make({2, 2, 2, 5, 5, 6, 7, 9})); + n = m.erase(6); + assert(n == 1); + assert(m == make({2, 2, 2, 5, 5, 7, 9})); + n = m.erase(7); + assert(n == 1); + assert(m == make({2, 2, 2, 5, 5, 9})); + n = m.erase(2); + assert(n == 3); + assert(m == make({5, 5, 9})); + n = m.erase(5); + assert(n == 2); + assert(m == make({9})); + n = m.erase(9); + assert(n == 1); + assert(m.empty()); + n = m.erase(1); + assert(n == 0); + assert(m.empty()); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector, std::greater<>>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto erase_function = [](auto& m, auto key_arg) { + using Map = std::decay_t; + using Key = typename Map::key_type; + const Key key{key_arg}; + m.erase(key); + }; + test_erase_exception_guarantee(erase_function); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key_transparent.pass.cpp new file 
mode 100644 index 0000000000000..75a2d205b8f87 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key_transparent.pass.cpp @@ -0,0 +1,161 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// size_type erase(K&& k); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. +template +concept CanErase = requires(M m, Transparent k) { m.erase(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanErase); +static_assert(!CanErase); +static_assert(!CanErase); +static_assert(!CanErase); + +template +struct HeterogeneousKey { + explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {} + operator It() && { return it_; } + auto operator<=>(Key key) const { return key_ <=> key; } + friend bool operator<(const HeterogeneousKey&, const HeterogeneousKey&) { + assert(false); + return false; + } + Key key_; + It it_; +}; + +template +void test_simple() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + M m = {{1, 1}, {2, 2}, {2, 2}, {3, 3}, {3, 4}, {3, 5}, {4, 4}}; + ASSERT_SAME_TYPE(decltype(m.erase(9)), typename M::size_type); + auto n = m.erase(3); // erase(K&&) [with K=int] + assert(n == 3); + assert((m == M{{1, 1}, {2, 2}, {2, 2}, {4, 4}})); + typename M::key_type lvalue = 2; + n = m.erase(lvalue); // erase(K&&) [with K=int&] + assert(n == 2); + assert((m == M{{1, 1}, {4, 4}})); + const typename M::key_type const_lvalue = 1; + n = m.erase(const_lvalue); // erase(const key_type&) + assert(n == 1); + assert((m == M{{4, 4}})); +} + +template +void test_transparent_comparator() { + using M = std::flat_multimap; + using P = std::pair; + M m = { + {"alpha", 1}, {"beta", 2}, {"epsilon", 3}, {"epsilon", 4}, {"eta", 4}, {"gamma", 5}, {"gamma", 6}, {"gamma", 7}}; + ASSERT_SAME_TYPE(decltype(m.erase(Transparent{"abc"})), typename M::size_type); + + auto n = m.erase(Transparent{"epsilon"}); + assert(n == 2); + assert(std::ranges::equal( + m, std::vector
<P>
    {{"alpha", 1}, {"beta", 2}, {"eta", 4}, {"gamma", 5}, {"gamma", 6}, {"gamma", 7}})); + + auto n2 = m.erase(Transparent{"aaa"}); + assert(n2 == 0); + assert(std::ranges::equal( + m, std::vector
<P>
    {{"alpha", 1}, {"beta", 2}, {"eta", 4}, {"gamma", 5}, {"gamma", 6}, {"gamma", 7}})); + + auto n3 = m.erase(Transparent{"gamma"}); + assert(n3 == 3); + assert(std::ranges::equal(m, std::vector
<P>
    {{"alpha", 1}, {"beta", 2}, {"eta", 4}})); + + auto n4 = m.erase(Transparent{"alpha"}); + assert(n4 == 1); + assert(std::ranges::equal(m, std::vector
<P>
    {{"beta", 2}, {"eta", 4}})); + + auto n5 = m.erase(Transparent{"alpha"}); + assert(n5 == 0); + assert(std::ranges::equal(m, std::vector
<P>
    {{"beta", 2}, {"eta", 4}})); + + auto n6 = m.erase(Transparent{"beta"}); + assert(n6 == 1); + assert(std::ranges::equal(m, std::vector
<P>
    {{"eta", 4}})); + + auto n7 = m.erase(Transparent{"eta"}); + assert(n7 == 1); + assert(std::ranges::equal(m, std::vector
<P>
    {})); + + auto n8 = m.erase(Transparent{"eta"}); + assert(n8 == 0); + assert(std::ranges::equal(m, std::vector
<P>
    {})); +} + +int main(int, char**) { + test_simple, std::vector>(); + test_simple, std::vector>(); + test_simple, MinSequenceContainer>(); + test_simple>, std::vector>>(); + + test_transparent_comparator, std::vector>(); + test_transparent_comparator, std::vector>(); + test_transparent_comparator, MinSequenceContainer>(); + test_transparent_comparator>, + std::vector>>(); + + { + // P2077's HeterogeneousKey example + using M = std::flat_multimap>; + M m = {{1, 1}, {2, 2}, {3, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {6, 6}, {7, 7}, {8, 8}, {8, 8}}; + auto h1 = HeterogeneousKey(8, m.begin()); + std::same_as auto n = m.erase(h1); // lvalue is not convertible to It; erase(K&&) is the best match + assert(n == 2); + assert((m == M{{1, 1}, {2, 2}, {3, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {6, 6}, {7, 7}})); + std::same_as auto it = m.erase(std::move(h1)); // rvalue is convertible to It; erase(K&&) drops out + assert(it == m.begin()); + assert((m == M{{2, 2}, {3, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {6, 6}, {7, 7}})); + } + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m(std::sorted_equivalent, {{1, 1}, {2, 2}, {3, 3}, {3, 3}}, c); + assert(!transparent_used); + auto n = m.erase(Transparent{3}); + assert(n == 2); + assert(transparent_used); + } + { + auto erase_transparent = [](auto& m, auto key_arg) { + using Map = std::decay_t; + using Key = typename Map::key_type; + m.erase(Transparent{key_arg}); + }; + test_erase_exception_guarantee(erase_transparent); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/extract.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/extract.pass.cpp new file mode 100644 index 0000000000000..f5ed4a9663a9d --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/extract.pass.cpp @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// containers extract() &&; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +concept CanExtract = requires(T&& t) { std::forward(t).extract(); }; + +static_assert(CanExtract&&>); +static_assert(!CanExtract&>); +static_assert(!CanExtract const&>); +static_assert(!CanExtract const&&>); + +template +void test() { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = M({1, 2, 2, 2, 3, 3}, {4, 5, 6, 7, 8, 9}); + + std::same_as auto containers = std::move(m).extract(); + + auto expected_keys = {1, 2, 2, 2, 3, 3}; + auto expected_values = {4, 5, 6, 7, 8, 9}; + assert(std::ranges::equal(containers.keys, expected_keys)); + assert(std::ranges::equal(containers.values, expected_values)); + check_invariant(m); + LIBCPP_ASSERT(m.empty()); + LIBCPP_ASSERT(m.keys().size() == 0); + LIBCPP_ASSERT(m.values().size() == 0); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + { + // extracted object maintains invariant if one of underlying container does not clear after move + using M = std::flat_multimap, std::vector, CopyOnlyVector>; + M m = M({1, 2, 2, 2, 3, 3}, {1, 2, 3, 4, 5, 6}); + std::same_as auto containers = std::move(m).extract(); + assert(containers.keys.size() == 6); + assert(containers.values.size() == 6); + check_invariant(m); + LIBCPP_ASSERT(m.empty()); + LIBCPP_ASSERT(m.keys().size() == 0); + LIBCPP_ASSERT(m.values().size() == 0); + } + + { +#ifndef TEST_HAS_NO_EXCEPTIONS + using KeyContainer = std::vector; + using ValueContainer = ThrowOnMoveContainer; + using M = std::flat_multimap; + + M m; + m.emplace(1, 1); + m.emplace(1, 1); + try { + auto c = std::move(m).extract(); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, we try to erase the key after value emplacement failure. + // and after erasure failure, we clear the flat_multimap + LIBCPP_ASSERT(m.size() == 0); + } +#endif + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_cv.pass.cpp new file mode 100644 index 0000000000000..88c173d8a6917 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_cv.pass.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator insert(const value_type& v); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "../helpers.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = typename M::iterator; + using VT = typename M::value_type; + M m; + + const VT v1(2, 2.5); + std::same_as decltype(auto) r = m.insert(v1); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == 2.5); + + const VT v2(1, 1.5); + r = m.insert(v2); + assert(r == m.begin()); + assert(m.size() == 2); + assert(r->first == 1); + assert(r->second == 1.5); + + const VT v3(3, 3.5); + r = m.insert(v3); + assert(r == m.begin() + 2); + assert(m.size() == 3); + assert(r->first == 3); + assert(r->second == 3.5); + + const VT v4(3, 4.5); + r = m.insert(v4); + assert(r == m.begin() + 3); + assert(m.size() == 4); + assert(r->first == 3); + assert(r->second == 4.5); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, auto key_arg, auto value_arg) { + using FlatMap = std::decay_t; + using value_type = typename FlatMap::value_type; + const value_type p(std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg)); + m.insert(p); + }; + test_emplace_exception_guarantee(insert_func); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_initializer_list.pass.cpp new file mode 100644 index 0000000000000..098b66cc49f18 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_initializer_list.pass.cpp @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// void insert(initializer_list il); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using V = std::pair; + + M m = {{1, 1}, {1, 1.5}, {1, 2}, {3, 1}, {3, 1.5}, {3, 2}}; + m.insert({ + {4, 1}, + {4, 1.5}, + {4, 2}, + {1, 1}, + {1, 1.5}, + {1, 2}, + {2, 1}, + {2, 1.5}, + {2, 2}, + }); + assert(m.size() == 15); + std::vector expected = { + {1, 1}, + {1, 1.5}, + {1, 2}, + {1, 1}, + {1, 1.5}, + {1, 2}, + {2, 1}, + {2, 1.5}, + {2, 2}, + {3, 1}, + {3, 1.5}, + {3, 2}, + {4, 1}, + {4, 1.5}, + {4, 2}, + }; + assert(std::ranges::equal(m, expected)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, const auto& newValues) { + using FlatMap = std::decay_t; + using value_type = typename FlatMap::value_type; + std::initializer_list il = {{newValues[0].first, newValues[0].second}}; + m.insert(il); + }; + test_insert_range_exception_guarantee(insert_func); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_cv.pass.cpp new file mode 100644 index 0000000000000..9d645043a15ca --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_cv.pass.cpp @@ -0,0 +1,95 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator insert(const_iterator position, const value_type& v); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "../helpers.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = typename M::iterator; + using VT = typename M::value_type; + + M m; + const VT v1(2, 2.5); + std::same_as decltype(auto) r = m.insert(m.end(), v1); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == 2.5); + + const VT v2(1, 1.5); + r = m.insert(m.end(), v2); + assert(r == m.begin()); + assert(m.size() == 2); + assert(r->first == 1); + assert(r->second == 1.5); + + const VT v3(3, 3.5); + r = m.insert(m.end(), v3); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 3); + assert(r->first == 3); + assert(r->second == 3.5); + + const VT v4(3, 4.5); + r = m.insert(m.end(), v4); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 4); + assert(r->first == 3); + assert(r->second == 4.5); + + const VT v5(2, 5.5); + r = m.insert(m.end(), v5); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(r->first == 2); + assert(r->second == 5.5); + + const VT v6(2, 6.5); + r = m.insert(m.begin(), v6); + assert(r == m.begin() + 1); + assert(m.size() == 6); + assert(r->first == 2); + assert(r->second == 6.5); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, auto key_arg, auto value_arg) { + using FlatMap = std::decay_t; + using value_type = typename FlatMap::value_type; + const value_type p(std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg)); + m.insert(m.begin(), p); + }; + test_emplace_exception_guarantee(insert_func); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_iter.pass.cpp new file mode 100644 index 0000000000000..ae031bd010f76 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_iter.pass.cpp @@ -0,0 +1,109 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template +// void insert(InputIterator first, InputIterator last); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "test_iterators.h" +#include "min_allocator.h" + +// test constraint InputIterator +template +concept CanInsert = requires(M m, Args&&... 
args) { m.insert(std::forward(args)...); }; + +using Map = std::flat_multimap; +using Pair = std::pair; + +static_assert(CanInsert); +static_assert(CanInsert, cpp17_input_iterator>); +static_assert(!CanInsert); +static_assert(!CanInsert, cpp20_input_iterator>); + +template +void test() { + using P = std::pair; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + P ar1[] = { + P(2, 1), + P(2, 1.5), + P(2, 2), + P(1, 1), + P(1, 1.5), + P(1, 2), + P(3, 1), + P(3, 1.5), + P(3, 2), + }; + P ar2[] = { + P(4, 1), + P(4, 1.5), + P(4, 2), + P(1, 1), + P(1, 1.5), + P(1, 2), + P(0, 1), + P(0, 1.5), + P(0, 2), + }; + + M m; + m.insert(cpp17_input_iterator(ar1), cpp17_input_iterator(ar1 + sizeof(ar1) / sizeof(ar1[0]))); + assert(m.size() == 9); + std::vector
<P>
    expected{{1, 1}, {1, 1.5}, {1, 2}, {2, 1}, {2, 1.5}, {2, 2}, {3, 1}, {3, 1.5}, {3, 2}}; + assert(std::ranges::equal(m, expected)); + + m.insert(cpp17_input_iterator(ar2), cpp17_input_iterator(ar2 + sizeof(ar2) / sizeof(ar2[0]))); + assert(m.size() == 18); + std::vector
<P>
    expected2{ + {0, 1}, + {0, 1.5}, + {0, 2}, + {1, 1}, + {1, 1.5}, + {1, 2}, + {1, 1}, + {1, 1.5}, + {1, 2}, + {2, 1}, + {2, 1.5}, + {2, 2}, + {3, 1}, + {3, 1.5}, + {3, 2}, + {4, 1}, + {4, 1.5}, + {4, 2}}; + assert(std::ranges::equal(m, expected2)); +} +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, const auto& newValues) { m.insert(newValues.begin(), newValues.end()); }; + test_insert_range_exception_guarantee(insert_func); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_rv.pass.cpp new file mode 100644 index 0000000000000..61962f4873aee --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_rv.pass.cpp @@ -0,0 +1,103 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator insert(const_iterator position, value_type&&); + +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "MoveOnly.h" +#include "min_allocator.h" +#include "../helpers.h" +#include "test_macros.h" + +template +void do_insert_iter_rv_test() { + using M = Container; + using P = Pair; + using R = typename M::iterator; + M m; + std::same_as decltype(auto) r = m.insert(m.end(), P(2, 2)); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == 2); + + r = m.insert(m.end(), P(1, 1)); + assert(r == m.begin()); + assert(m.size() == 2); + assert(r->first == 1); + assert(r->second == 1); + + r = m.insert(m.end(), P(3, 3)); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 3); + assert(r->first == 3); + assert(r->second == 3); + + r = m.insert(m.end(), P(3, 4)); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 4); + assert(r->first == 3); + assert(r->second == 4); + + r = m.insert(m.end(), P(2, 5)); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(r->first == 2); + assert(r->second == 5); + + r = m.insert(m.begin(), P(2, 6)); + assert(r == m.begin() + 1); + assert(m.size() == 6); + assert(r->first == 2); + assert(r->second == 6); +} + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using P = std::pair; + using CP = std::pair; + + do_insert_iter_rv_test(); + do_insert_iter_rv_test(); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, std::deque>(); + test, std::deque>(); + test, MinSequenceContainer>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, auto key_arg, auto value_arg) { + using FlatMap = std::decay_t; + using value_type = typename FlatMap::value_type; + value_type p(std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg)); + m.insert(m.begin(), std::move(p)); + }; + 
test_emplace_exception_guarantee(insert_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_range.pass.cpp new file mode 100644 index 0000000000000..97b8f17d1094f --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_range.pass.cpp @@ -0,0 +1,101 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template R> +// void insert_range(R&& rg); + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "MoveOnly.h" +#include "test_macros.h" +#include "test_iterators.h" +#include "min_allocator.h" + +// test constraint container-compatible-range +template +concept CanInsertRange = requires(M m, R&& r) { m.insert_range(std::forward(r)); }; + +using Map = std::flat_multimap; + +static_assert(CanInsertRange*>>); +static_assert(CanInsertRange*>>); +static_assert(!CanInsertRange>); +static_assert(!CanInsertRange>); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + + { + using P = std::pair; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using It = forward_iterator; + M m = {{10, 1}, {8, 2}, {5, 3}, {2, 4}, {1, 5}}; + P ar[] = {{3, 1}, {1, 2}, {4, 3}, {1, 4}, {5, 5}, {9, 6}}; + std::ranges::subrange r = {It(ar), It(ar + 6)}; + static_assert(std::ranges::common_range); + m.insert_range(r); + std::vector
<P>
    expected = {{1, 5}, {1, 2}, {1, 4}, {2, 4}, {3, 1}, {4, 3}, {5, 3}, {5, 5}, {8, 2}, {9, 6}, {10, 1}}; + assert(std::ranges::equal(m, expected)); + } + { + using P = std::pair; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using It = cpp20_input_iterator; + M m = {{8, 1}, {5, 2}, {3, 3}, {2, 4}}; + P ar[] = {{3, 1}, {1, 2}, {4, 3}, {1, 4}, {5, 5}, {9, 6}}; + std::ranges::subrange r = {It(ar), sentinel_wrapper(It(ar + 6))}; + static_assert(!std::ranges::common_range); + m.insert_range(r); + std::vector
<P>
    expected = {{9, 6}, {8, 1}, {5, 2}, {5, 5}, {4, 3}, {3, 3}, {3, 1}, {2, 4}, {1, 2}, {1, 4}}; + assert(std::ranges::equal(m, expected)); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + { + // Items are forwarded correctly from the input range (P2767). + std::pair a[] = {{3, 3}, {1, 1}, {4, 4}, {1, 1}, {5, 5}}; + std::flat_multimap m; + m.insert_range(a | std::views::as_rvalue); + std::pair expected[] = {{1, 1}, {1, 1}, {3, 3}, {4, 4}, {5, 5}}; + assert(std::ranges::equal(m, expected)); + } + { + // The element type of the range doesn't need to be std::pair (P2767). + std::pair pa[] = {{3, 3}, {1, 1}, {4, 4}, {1, 1}, {5, 5}}; + std::deque>> a(pa, pa + 5); + std::flat_multimap m; + m.insert_range(a); + std::pair expected[] = {{1, 1}, {1, 1}, {3, 3}, {4, 4}, {5, 5}}; + assert(std::ranges::equal(m, expected)); + } + { + auto insert_func = [](auto& m, const auto& newValues) { m.insert_range(newValues); }; + test_insert_range_exception_guarantee(insert_func); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_rv.pass.cpp new file mode 100644 index 0000000000000..573150248ca48 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_rv.pass.cpp @@ -0,0 +1,116 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator insert( value_type&& v); + +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "MoveOnly.h" +#include "min_allocator.h" +#include "test_macros.h" +#include "../helpers.h" + +template +void do_insert_rv_test() { + using M = Container; + using P = Pair; + using R = typename M::iterator; + M m; + std::same_as decltype(auto) r = m.insert(P(2, 2)); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == 2); + + r = m.insert(P(1, 1)); + assert(r == m.begin()); + assert(m.size() == 2); + assert(r->first == 1); + assert(r->second == 1); + + r = m.insert(P(3, 3)); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 3); + assert(r->first == 3); + assert(r->second == 3); + + r = m.insert(P(3, 3)); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 4); + assert(r->first == 3); + assert(r->second == 3); +} + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + using P = std::pair; + using CP = std::pair; + + do_insert_rv_test(); + do_insert_rv_test(); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + using M = std::flat_multimap; + using R = M::iterator; + M m; + R r = m.insert({2, MoveOnly(2)}); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == 2); + + r = m.insert({1, MoveOnly(1)}); + assert(r == m.begin()); + 
assert(m.size() == 2); + assert(r->first == 1); + assert(r->second == 1); + + r = m.insert({3, MoveOnly(3)}); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 3); + assert(r->first == 3); + assert(r->second == 3); + + r = m.insert({3, MoveOnly(3)}); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 4); + assert(r->first == 3); + assert(r->second == 3); + } + { + auto insert_func = [](auto& m, auto key_arg, auto value_arg) { + using FlatMap = std::decay_t; + using value_type = typename FlatMap::value_type; + value_type p(std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg)); + m.insert(std::move(p)); + }; + test_emplace_exception_guarantee(insert_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_initializer_list.pass.cpp new file mode 100644 index 0000000000000..334dff0a0d2f6 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_initializer_list.pass.cpp @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// void insert(sorted_equivalent_t, initializer_list il); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + using V = std::pair; + M m = {{1, 1}, {1, 1.5}, {1, 2}, {3, 1}, {3, 1.5}, {3, 2}}; + m.insert(std::sorted_equivalent, + { + {0, 1}, + {1, 2}, + {1, 3}, + {2, 1}, + {2, 4}, + {4, 1}, + }); + assert(m.size() == 12); + V expected[] = {{0, 1}, {1, 1}, {1, 1.5}, {1, 2}, {1, 2}, {1, 3}, {2, 1}, {2, 4}, {3, 1}, {3, 1.5}, {3, 2}, {4, 1}}; + assert(std::ranges::equal(m, expected)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, const auto& newValues) { + using FlatMap = std::decay_t; + using value_type = typename FlatMap::value_type; + std::initializer_list il = {{newValues[0].first, newValues[0].second}}; + m.insert(std::sorted_equivalent, il); + }; + test_insert_range_exception_guarantee(insert_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_iter_iter.pass.cpp new file mode 100644 index 0000000000000..37808470a2cf7 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_iter_iter.pass.cpp @@ -0,0 +1,94 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template +// void insert(sorted_equivalent_t, InputIterator first, InputIterator last); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "test_iterators.h" +#include "min_allocator.h" + +// test constraint InputIterator +template +concept CanInsert = requires(M m, Args&&... args) { m.insert(std::forward(args)...); }; + +using Map = std::flat_multimap; +using Pair = std::pair; + +static_assert(CanInsert); +static_assert(CanInsert, cpp17_input_iterator>); +static_assert(!CanInsert); +static_assert(!CanInsert, cpp20_input_iterator>); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using P = std::pair; + + P ar1[] = { + P(1, 1), + P(1, 0), + P(2, 1), + P(2, 3), + P(3, 1), + }; + + P ar2[] = { + P(0, 1), + P(2, 2), + P(2, 5), + P(4, 1), + P(4, 4), + }; + + M m; + m.insert(std::sorted_equivalent, + cpp17_input_iterator(ar1), + cpp17_input_iterator(ar1 + sizeof(ar1) / sizeof(ar1[0]))); + assert(m.size() == 5); + P expected[] = {{1, 1}, {1, 0}, {2, 1}, {2, 3}, {3, 1}}; + assert(std::ranges::equal(m, expected)); + + m.insert(std::sorted_equivalent, + cpp17_input_iterator(ar2), + cpp17_input_iterator(ar2 + sizeof(ar2) / sizeof(ar2[0]))); + assert(m.size() == 10); + P expected2[] = {{0, 1}, {1, 1}, {1, 0}, {2, 1}, {2, 3}, {2, 2}, {2, 5}, {3, 1}, {4, 1}, {4, 4}}; + assert(std::ranges::equal(m, expected2)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, const auto& newValues) { + m.insert(std::sorted_equivalent, newValues.begin(), newValues.end()); + }; + test_insert_range_exception_guarantee(insert_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_transparent.pass.cpp new file mode 100644 index 0000000000000..33ca4d4e30469 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_transparent.pass.cpp @@ -0,0 +1,135 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template iterator insert(P&& x); +// template iterator insert(const_iterator hint, P&& x); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "test_iterators.h" +#include "min_allocator.h" + +// Constraints: is_constructible_v, P> is true. +template +concept CanInsert = requires(M m, Args&&... 
args) { m.insert(std::forward(args)...); }; + +using Map = std::flat_multimap; +using Iter = Map::const_iterator; + +static_assert(CanInsert&&>); +static_assert(CanInsert&&>); +static_assert(CanInsert&&>); +static_assert(CanInsert&&>); +static_assert(!CanInsert); +static_assert(!CanInsert); + +static int expensive_comparisons = 0; +static int cheap_comparisons = 0; + +struct CompareCounter { + int i_ = 0; + CompareCounter(int i) : i_(i) {} + friend auto operator<=>(const CompareCounter& x, const CompareCounter& y) { + expensive_comparisons += 1; + return x.i_ <=> y.i_; + } + bool operator==(const CompareCounter&) const = default; + friend auto operator<=>(const CompareCounter& x, int y) { + cheap_comparisons += 1; + return x.i_ <=> y; + } +}; + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + { + // insert(P&&) + // Unlike flat_set, here we can't use key_compare to compare value_type versus P, + // so we must eagerly convert to value_type. + M m = {{1, 1}, {2, 2}, {3, 1}, {3, 4}, {4, 4}, {5, 5}}; + expensive_comparisons = 0; + cheap_comparisons = 0; + std::same_as decltype(auto) r = m.insert(std::make_pair(3, 3)); // conversion happens first + assert(expensive_comparisons >= 2); + assert(cheap_comparisons == 0); + assert(r == m.begin() + 4); + + std::pair expected[] = {{1, 1}, {2, 2}, {3, 1}, {3, 4}, {3, 3}, {4, 4}, {5, 5}}; + assert(std::ranges::equal(m, expected)); + } + { + // insert(const_iterator, P&&) + M m = {{1, 1}, {2, 2}, {3, 1}, {3, 4}, {4, 4}, {5, 5}}; + expensive_comparisons = 0; + cheap_comparisons = 0; + std::same_as auto it = m.insert(m.begin(), std::make_pair(3, 3)); + assert(expensive_comparisons >= 2); + assert(cheap_comparisons == 0); + assert(it == m.begin() + 2); + std::pair expected[] = {{1, 1}, {2, 2}, {3, 3}, {3, 1}, {3, 4}, {4, 4}, {5, 5}}; + assert(std::ranges::equal(m, expected)); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + // no ambiguity between insert(pos, P&&) and insert(first, last) + using M = std::flat_multimap; + struct Evil { + operator M::value_type() const; + operator M::const_iterator() const; + }; + std::flat_multimap m; + ASSERT_SAME_TYPE(decltype(m.insert(Evil())), M::iterator); + ASSERT_SAME_TYPE(decltype(m.insert(m.begin(), Evil())), M::iterator); + ASSERT_SAME_TYPE(decltype(m.insert(m.begin(), m.end())), void); + } + { + auto insert_func = [](auto& m, auto key_arg, auto value_arg) { + using FlatMap = std::decay_t; + using tuple_type = std::tuple; + tuple_type t(key_arg, value_arg); + m.insert(t); + }; + test_emplace_exception_guarantee(insert_func); + } + { + auto insert_func_iter = [](auto& m, auto key_arg, auto value_arg) { + using FlatMap = std::decay_t; + using tuple_type = std::tuple; + tuple_type t(key_arg, value_arg); + m.insert(m.begin(), t); + }; + test_emplace_exception_guarantee(insert_func_iter); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/replace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/replace.pass.cpp new file mode 100644 index 0000000000000..86fbaff468ab6 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/replace.pass.cpp @@ -0,0 +1,82 @@ 
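+// (A hedged usage sketch of the replace() contract exercised in this file, not
+// code from the patch itself: replace() adopts both containers wholesale; the
+// caller promises the keys are already sorted with respect to the comparator
+// and that both containers have the same size, so no re-sort happens.)
+//
+//   std::flat_multimap<int, char> m;
+//   std::vector<int>  ks{1, 1, 2};   // pre-sorted; duplicate keys are allowed
+//   std::vector<char> vs{'a', 'b', 'c'};
+//   m.replace(std::move(ks), std::move(vs));  // adopts the containers by move
+//   assert(m.size() == 3);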
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// void replace(key_container_type&& key_cont, mapped_container_type&& mapped_cont); + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +concept CanReplace = requires(T t, Args&&... args) { t.replace(std::forward(args)...); }; + +using Map = std::flat_multimap; +static_assert(CanReplace, std::vector>); +static_assert(!CanReplace&, std::vector>); +static_assert(!CanReplace, const std::vector&>); +static_assert(!CanReplace&, const std::vector&>); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + M m = M({1, 1, 3}, {4, 5, 6}); + KeyContainer new_keys = {7, 7}; + ValueContainer new_values = {9, 10}; + auto expected_keys = new_keys; + auto expected_values = new_values; + m.replace(std::move(new_keys), std::move(new_values)); + assert(m.size() == 2); + assert(std::ranges::equal(m.keys(), expected_keys)); + assert(std::ranges::equal(m.values(), expected_values)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { +#ifndef TEST_HAS_NO_EXCEPTIONS + using KeyContainer = std::vector; + using ValueContainer = ThrowOnMoveContainer; + using M = std::flat_multimap; + + M m; + m.emplace(1, 1); + m.emplace(2, 2); + try { + KeyContainer new_keys{3, 4}; + ValueContainer new_values{5, 6}; + m.replace(std::move(new_keys), std::move(new_values)); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, we clear the map + LIBCPP_ASSERT(m.size() == 0); + } +#endif + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_exception.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_exception.pass.cpp new file mode 100644 index 0000000000000..a1252f301309a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_exception.pass.cpp @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// `check_assertion.h` requires Unix headers and regex support. 
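+// (Aside, a hedged sketch of the property under test in this file: both member
+// and free swap are declared noexcept, so an exception escaping a throwing
+// container swap has nowhere to propagate and std::terminate must be called.
+// ThrowOnMoveContainer comes from "../helpers.h".)
+//
+//   std::flat_multimap<int, int, std::less<int>,
+//                      ThrowOnMoveContainer<int>, std::vector<int>> m1, m2;
+//   // swap(m1, m2);  // reaches std::terminate if ThrowOnMoveContainer throws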
+// REQUIRES: has-unix-headers +// UNSUPPORTED: no-localization +// UNSUPPORTED: no-exceptions + +// + +// class flat_multimap + +// void swap(flat_multimap& y) noexcept; +// friend void swap(flat_multimap& x, flat_multimap& y) noexcept + +// Test that std::terminate is called if any exception is thrown during swap + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "../helpers.h" +#include "check_assertion.h" + +template +void test_swap_exception_guarantee([[maybe_unused]] F&& swap_function) { + { + // key swap throws + using KeyContainer = ThrowOnMoveContainer; + using ValueContainer = std::vector; + using M = std::flat_multimap; + + M m1, m2; + m1.emplace(1, 1); + m1.emplace(1, 2); + m2.emplace(3, 3); + m2.emplace(3, 4); + // swap is noexcept + EXPECT_STD_TERMINATE([&] { swap_function(m1, m2); }); + } + + { + // value swap throws + using KeyContainer = std::vector; + using ValueContainer = ThrowOnMoveContainer; + using M = std::flat_multimap; + + M m1, m2; + m1.emplace(1, 1); + m1.emplace(1, 2); + m2.emplace(3, 3); + m2.emplace(3, 4); + + // swap is noexcept + EXPECT_STD_TERMINATE([&] { swap_function(m1, m2); }); + } +} + +int main(int, char**) { + { + auto swap_func = [](auto& m1, auto& m2) { swap(m1, m2); }; + test_swap_exception_guarantee(swap_func); + } + + { + auto swap_func = [](auto& m1, auto& m2) { m1.swap(m2); }; + test_swap_exception_guarantee(swap_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_free.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_free.pass.cpp new file mode 100644 index 0000000000000..f96155d714dc9 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_free.pass.cpp @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// friend void swap(flat_multimap& x, flat_multimap& y) noexcept + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "MoveOnly.h" +#include "min_allocator.h" +#include "test_macros.h" +#include "../helpers.h" + +// test noexcept + +template +concept NoExceptAdlSwap = requires(T t1, T t2) { + { swap(t1, t2) } noexcept; +}; + +static_assert(NoExceptAdlSwap>); + +#ifndef TEST_HAS_NO_EXCEPTIONS +static_assert(NoExceptAdlSwap< + std::flat_multimap, ThrowOnMoveContainer, ThrowOnMoveContainer>>); +#endif + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using V = std::pair; + + { + M m1; + M m2; + M m1_save = m1; + M m2_save = m2; + swap(m1, m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + V ar2[] = {V(5, 5), V(5, 6), V(5, 7), V(8, 8), V(9, 9), V(10, 10), V(10, 11), V(10, 12)}; + M m1; + M m2(ar2, ar2 + sizeof(ar2) / sizeof(ar2[0])); + M m1_save = m1; + M m2_save = m2; + swap(m1, m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + V ar1[] = {V(1, 1), V(1, 2), V(3, 3), V(4, 4)}; + M m1(ar1, ar1 + sizeof(ar1) / sizeof(ar1[0])); + M m2; + M m1_save = m1; + M m2_save = m2; + swap(m1, m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + V ar1[] = {V(1, 1), V(1, 2), V(3, 3), V(4, 4)}; + V ar2[] = {V(5, 5), V(5, 6), V(5, 7), V(8, 8), V(9, 9), V(10, 10), V(10, 11), V(10, 12)}; + M m1(ar1, ar1 + sizeof(ar1) / sizeof(ar1[0])); + M m2(ar2, ar2 + sizeof(ar2) / sizeof(ar2[0])); + M m1_save = m1; + M m2_save = m2; + swap(m1, m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_member.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_member.pass.cpp new file mode 100644 index 0000000000000..ab7be3b8ac22e --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_member.pass.cpp @@ -0,0 +1,97 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// void swap(flat_multimap& y) noexcept; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "MoveOnly.h" +#include "min_allocator.h" +#include "test_macros.h" +#include "../helpers.h" + +// test noexcept + +template +concept NoExceptMemberSwap = requires(T t1, T t2) { + { t1.swap(t2) } noexcept; +}; + +static_assert(NoExceptMemberSwap>); +#ifndef TEST_HAS_NO_EXCEPTIONS +static_assert(NoExceptMemberSwap< + std::flat_multimap, ThrowOnMoveContainer, ThrowOnMoveContainer>>); +#endif + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using V = std::pair; + { + M m1; + M m2; + M m1_save = m1; + M m2_save = m2; + m1.swap(m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + V ar2[] = {V(5, 5), V(5, 6), V(7, 7), V(8, 8), V(9, 9), V(10, 10), V(10, 11), V(12, 12)}; + M m1; + M m2(ar2, ar2 + sizeof(ar2) / sizeof(ar2[0])); + M m1_save = m1; + M m2_save = m2; + m1.swap(m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + V ar1[] = {V(1, 1), V(1, 2), V(3, 3), V(4, 4)}; + M m1(ar1, ar1 + sizeof(ar1) / sizeof(ar1[0])); + M m2; + M m1_save = m1; + M m2_save = m2; + m1.swap(m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + V ar1[] = {V(1, 1), V(1, 2), V(3, 3), V(4, 4)}; + V ar2[] = {V(5, 5), V(5, 6), V(7, 7), V(8, 8), V(9, 9), V(10, 10), V(10, 11), V(12, 12)}; + M m1(ar1, ar1 + sizeof(ar1) / sizeof(ar1[0])); + M m2(ar2, ar2 + sizeof(ar2) / sizeof(ar2[0])); + M m1_save = m1; + M m2_save = m2; + m1.swap(m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/comp.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/comp.pass.cpp new file mode 100644 index 0000000000000..47140132c6e47 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/comp.pass.cpp @@ -0,0 +1,98 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// key_compare key_comp() const; +// value_compare value_comp() const; + +#include +#include +#include +#include +#include + +#include "test_macros.h" + +int main(int, char**) { + { + using M = std::flat_multimap; + using Comp = std::less; // the default + M m = {}; + ASSERT_SAME_TYPE(M::key_compare, Comp); + static_assert(!std::is_same_v); + ASSERT_SAME_TYPE(decltype(m.key_comp()), Comp); + ASSERT_SAME_TYPE(decltype(m.value_comp()), M::value_compare); + Comp kc = m.key_comp(); + assert(kc(1, 2)); + assert(!kc(2, 1)); + auto vc = m.value_comp(); + ASSERT_SAME_TYPE(decltype(vc(std::make_pair(1, 2), std::make_pair(1, 2))), bool); + assert(vc({1, '2'}, {2, '1'})); + assert(!vc({2, '1'}, {1, '2'})); + } + { + using Comp = std::function; + using M = std::flat_multimap; + Comp comp = std::greater(); + M m({}, comp); + ASSERT_SAME_TYPE(M::key_compare, Comp); + ASSERT_SAME_TYPE(decltype(m.key_comp()), Comp); + ASSERT_SAME_TYPE(decltype(m.value_comp()), M::value_compare); + Comp kc = m.key_comp(); + assert(!kc(1, 2)); + assert(kc(2, 1)); + auto vc = m.value_comp(); + auto a = std::make_pair(1, 2); + ASSERT_SAME_TYPE(decltype(vc(a, a)), bool); + static_assert(!noexcept(vc(a, a))); + assert(!vc({1, 2}, {2, 1})); + assert(vc({2, 1}, {1, 2})); + } + { + using Comp = std::less<>; + using M = std::flat_multimap; + M m = {}; + ASSERT_SAME_TYPE(M::key_compare, Comp); + ASSERT_SAME_TYPE(decltype(m.key_comp()), Comp); + ASSERT_SAME_TYPE(decltype(m.value_comp()), M::value_compare); + Comp kc = m.key_comp(); + assert(kc(1, 2)); + assert(!kc(2, 1)); + auto vc = m.value_comp(); + auto a = std::make_pair(1, 2); + ASSERT_SAME_TYPE(decltype(vc(a, a)), bool); + assert(vc({1, 2}, {2, 1})); + assert(!vc({2, 1}, {1, 2})); + } + { + using Comp = std::function&, const std::vector&)>; + using M = std::flat_multimap, int, Comp>; + Comp comp = [i = 1](const auto& x, const auto& y) { return x[i] < y[i]; }; + M m({}, comp); + auto vc = m.value_comp(); + static_assert(sizeof(vc) >= sizeof(Comp)); + comp = nullptr; + m = M({}, nullptr); + assert(m.key_comp() == nullptr); + // At this point, m.key_comp() is disengaged. + // But the std::function captured by copy inside `vc` remains valid. + auto a = std::make_pair(std::vector{2, 1, 4}, 42); + auto b = std::make_pair(std::vector{1, 2, 3}, 42); + auto c = std::make_pair(std::vector{0, 3, 2}, 42); + assert(vc(a, b)); + assert(vc(b, c)); + assert(!vc(b, a)); + assert(!vc(c, b)); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/keys_values.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/keys_values.pass.cpp new file mode 100644 index 0000000000000..c7c674c034bca --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/keys_values.pass.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// const key_container_type& keys() const noexcept +// const mapped_container_type& values() const noexcept + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "test_allocator.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + const M m = {{4, 'a'}, {2, 'b'}, {2, 'e'}, {3, 'c'}}; + std::same_as decltype(auto) keys = m.keys(); + std::same_as decltype(auto) values = m.values(); + + // noexcept + static_assert(noexcept(m.keys())); + static_assert(noexcept(m.values())); + + auto expected_keys = {2, 2, 3, 4}; + auto expected_values = {'b', 'e', 'c', 'a'}; + assert(std::ranges::equal(keys, expected_keys)); + assert(std::ranges::equal(values, expected_values)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains.pass.cpp new file mode 100644 index 0000000000000..b3ea0b65a3d93 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// bool contains(const key_type& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = {{1, 1}, {2, 2}, {2, 3}, {4, 4}, {5, 5}, {8, 1}, {8, 2}, {8, 8}}; + assert(!m.contains(0)); + assert(m.contains(1)); + assert(m.contains(2)); + assert(!m.contains(3)); + assert(m.contains(4)); + assert(m.contains(5)); + assert(!m.contains(6)); + assert(!m.contains(7)); + assert(std::as_const(m).contains(8)); + assert(!std::as_const(m).contains(9)); + m.clear(); + assert(!m.contains(1)); + } + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = {{1, 0}, {2, 0}, {4, 0}, {2, 1}, {5, 1}, {5, 2}, {5, 0}, {8, 0}}; + assert(!m.contains(0)); + assert(m.contains(1)); + assert(m.contains(2)); + assert(!m.contains(3)); + assert(m.contains(4)); + assert(m.contains(5)); + assert(!m.contains(6)); + assert(!m.contains(7)); + assert(std::as_const(m).contains(8)); + assert(!std::as_const(m).contains(9)); + m.clear(); + assert(!m.contains(1)); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains_transparent.pass.cpp new file mode 100644 index 0000000000000..8a66ec63768d7 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains_transparent.pass.cpp @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template bool contains(const K& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. 
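+// For intuition, a hedged sketch of the mechanism that constraint enables
+// (StringCmp here is hypothetical; the tests below use Transparent and
+// TransparentComparator from "../helpers.h"):
+//
+//   struct StringCmp {
+//     using is_transparent = void;  // opts the comparator into heterogeneous lookup
+//     bool operator()(std::string_view a, std::string_view b) const { return a < b; }
+//   };
+//   std::flat_multimap<std::string, int, StringCmp> m{{"alpha", 1}, {"beta", 2}};
+//   assert(m.contains("alpha"));  // compares via string_view; no std::string temporary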
+template +concept CanContains = requires(M m, Transparent k) { m.contains(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanContains); +static_assert(CanContains); +static_assert(!CanContains); +static_assert(!CanContains); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + M m = {{"alpha", 1}, {"beta", 2}, {"beta", 0}, {"epsilon", 3}, {"eta", 4}, {"eta", 1}, {"gamma", 5}}; + ASSERT_SAME_TYPE(decltype(m.contains(Transparent{"abc"})), bool); + ASSERT_SAME_TYPE(decltype(std::as_const(m).contains(Transparent{"b"})), bool); + assert(m.contains(Transparent{"alpha"}) == true); + assert(m.contains(Transparent{"beta"}) == true); + assert(m.contains(Transparent{"epsilon"}) == true); + assert(m.contains(Transparent{"eta"}) == true); + assert(m.contains(Transparent{"gamma"}) == true); + assert(m.contains(Transparent{"al"}) == false); + assert(m.contains(Transparent{""}) == false); + assert(m.contains(Transparent{"g"}) == false); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m(std::sorted_equivalent, {{1, 1}, {1, 2}, {2, 2}, {3, 3}}, c); + assert(!transparent_used); + auto b = m.contains(Transparent{3}); + assert(b); + assert(transparent_used); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count.pass.cpp new file mode 100644 index 0000000000000..59b88428cde3c --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count.pass.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// size_type count(const key_type& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = {{1, 1}, {2, 2}, {2, 2}, {4, 4}, {4, 1}, {4, 3}, {4, 4}, {5, 5}, {8, 8}}; + ASSERT_SAME_TYPE(decltype(m.count(0)), size_t); + assert(m.count(0) == 0); + assert(m.count(1) == 1); + assert(m.count(2) == 2); + assert(m.count(3) == 0); + assert(m.count(4) == 4); + assert(m.count(5) == 1); + assert(m.count(6) == 0); + assert(m.count(7) == 0); + assert(std::as_const(m).count(8) == 1); + assert(std::as_const(m).count(9) == 0); + } + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = {{1, 0}, {2, 0}, {4, 0}, {1, 0}, {1, 2}, {8, 1}, {5, 0}, {8, 0}}; + ASSERT_SAME_TYPE(decltype(m.count(0)), size_t); + assert(m.count(0) == 0); + assert(m.count(1) == 3); + assert(m.count(2) == 1); + assert(m.count(3) == 0); + assert(m.count(4) == 1); + assert(m.count(5) == 1); + assert(m.count(6) == 0); + assert(m.count(7) == 0); + assert(std::as_const(m).count(8) == 2); + assert(std::as_const(m).count(9) == 0); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count_transparent.pass.cpp new file mode 100644 index 0000000000000..41f71065b2f75 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count_transparent.pass.cpp @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template size_type count(const K& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. 
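+// For intuition, a hedged sketch (not the library implementation) of what
+// count(k) computes for a multimap-style container: the length of the equal
+// range, which the random-access iterators let us take as a difference.
+//
+//   template <class M, class K>
+//   typename M::size_type count_like(const M& m, const K& k) {
+//     auto [first, last] = m.equal_range(k);  // heterogeneous overload
+//     return static_cast<typename M::size_type>(last - first);
+//   }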
+template +concept CanCount = requires(M m, Transparent k) { m.count(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanCount); +static_assert(CanCount); +static_assert(!CanCount); +static_assert(!CanCount); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + M m = {{"alpha", 1}, + {"beta", 2}, + {"beta", 2}, + {"epsilon", 3}, + {"eta", 4}, + {"eta", 1}, + {"eta", 5}, + {"gamma", 6}, + {"gamma", 5}}; + ASSERT_SAME_TYPE(decltype(m.count(Transparent{"abc"})), typename M::size_type); + ASSERT_SAME_TYPE(decltype(std::as_const(m).count(Transparent{"b"})), typename M::size_type); + assert(m.count(Transparent{"alpha"}) == 1); + assert(m.count(Transparent{"beta"}) == 2); + assert(m.count(Transparent{"epsilon"}) == 1); + assert(m.count(Transparent{"eta"}) == 3); + assert(m.count(Transparent{"gamma"}) == 2); + assert(m.count(Transparent{"al"}) == 0); + assert(m.count(Transparent{""}) == 0); + assert(m.count(Transparent{"g"}) == 0); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m( + std::sorted_equivalent, {{1, 1}, {2, 2}, {2, 2}, {3, 3}, {3, 3}}, c); + assert(!transparent_used); + auto n = m.count(Transparent{3}); + assert(n == 2); + assert(transparent_used); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range.pass.cpp new file mode 100644 index 0000000000000..ac369b77a7f3d --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range.pass.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// pair equal_range(const key_type& k); +// pair equal_range(const key_type& k) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = std::pair; + using CR = std::pair; + M m = {{1, 'a'}, {1, 'a'}, {1, 'A'}, {2, 'b'}, {4, 'd'}, {5, 'E'}, {5, 'e'}, {8, 'h'}, {8, 'z'}}; + ASSERT_SAME_TYPE(decltype(m.equal_range(0)), R); + ASSERT_SAME_TYPE(decltype(std::as_const(m).equal_range(0)), CR); + auto begin = m.begin(); + assert(m.equal_range(0) == std::pair(begin, begin)); + assert(m.equal_range(1) == std::pair(begin, begin + 3)); + assert(m.equal_range(2) == std::pair(begin + 3, begin + 4)); + assert(m.equal_range(3) == std::pair(begin + 4, begin + 4)); + assert(m.equal_range(4) == std::pair(begin + 4, begin + 5)); + assert(m.equal_range(5) == std::pair(begin + 5, begin + 7)); + assert(m.equal_range(6) == std::pair(begin + 7, begin + 7)); + assert(m.equal_range(7) == std::pair(begin + 7, begin + 7)); + assert(std::as_const(m).equal_range(8) == std::pair(m.cbegin() + 7, m.cbegin() + 9)); + assert(std::as_const(m).equal_range(9) == std::pair(m.cbegin() + 9, m.cbegin() + 9)); + } + + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = std::pair; + using CR = std::pair; + M m = { + {1, 'a'}, {2, 'b'}, {2, 'b'}, {2, 'c'}, {4, 'a'}, {4, 'b'}, {4, 'c'}, {4, 'd'}, {5, 'e'}, {8, 'a'}, {8, 'h'}}; + ASSERT_SAME_TYPE(decltype(m.equal_range(0)), R); + ASSERT_SAME_TYPE(decltype(std::as_const(m).equal_range(0)), CR); + auto begin = m.begin(); + assert(m.equal_range(0) == std::pair(begin + 11, begin + 11)); + assert(m.equal_range(1) == std::pair(begin + 10, begin + 11)); + assert(m.equal_range(2) == std::pair(begin + 7, begin + 10)); + assert(m.equal_range(3) == std::pair(begin + 7, begin + 7)); + assert(m.equal_range(4) == std::pair(begin + 3, begin + 7)); + assert(m.equal_range(5) == std::pair(begin + 2, begin + 3)); + assert(m.equal_range(6) == std::pair(begin + 2, begin + 2)); + assert(m.equal_range(7) == std::pair(begin + 2, begin + 2)); + assert(std::as_const(m).equal_range(8) == std::pair(m.cbegin(), m.cbegin() + 2)); + assert(std::as_const(m).equal_range(9) == std::pair(m.cbegin(), m.cbegin())); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range_transparent.pass.cpp new file mode 100644 index 0000000000000..3666492bb921f --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range_transparent.pass.cpp @@ -0,0 +1,110 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template pair equal_range(const K& x); +// template pair equal_range(const K& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. +template +concept CanEqualRange = requires(M m, Transparent k) { m.equal_range(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanEqualRange); +static_assert(CanEqualRange); +static_assert(!CanEqualRange); +static_assert(!CanEqualRange); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + using R = std::pair; + using CR = std::pair; + M m = {{"alpha", 1}, + {"alpha", 1}, + {"alpha", 3}, + {"beta", 2}, + {"epsilon", 3}, + {"epsilon", 0}, + {"eta", 4}, + {"gamma", 5}, + {"gamma", 1}}; + const auto& cm = m; + ASSERT_SAME_TYPE(decltype(m.equal_range(Transparent{"abc"})), R); + ASSERT_SAME_TYPE(decltype(std::as_const(m).equal_range(Transparent{"b"})), CR); + + auto test_found = [&](auto&& map, const auto& expected_key, std::initializer_list expected_values) { + auto [first, last] = map.equal_range(Transparent{expected_key}); + auto expected_range = + expected_values | std::views::transform([&](auto&& val) { return std::pair(expected_key, val); }); + assert(std::ranges::equal(std::ranges::subrange(first, last), expected_range)); + }; + + auto test_not_found = [&](auto&& map, const std::string& expected_key, long expected_offset) { + auto [first, last] = map.equal_range(Transparent{expected_key}); + assert(first == last); + assert(first - m.begin() == expected_offset); + }; + + test_found(m, "alpha", {1, 1, 3}); + test_found(m, "beta", {2}); + test_found(m, "epsilon", {3, 0}); + test_found(m, "eta", {4}); + test_found(m, "gamma", {5, 1}); + test_found(cm, "alpha", {1, 1, 3}); + test_found(cm, "beta", {2}); + test_found(cm, "epsilon", {3, 0}); + test_found(cm, "eta", {4}); + test_found(cm, "gamma", {5, 1}); + + test_not_found(m, "charlie", 4); + test_not_found(m, "aaa", 0); + test_not_found(m, "zzz", 9); + test_not_found(cm, "charlie", 4); + test_not_found(cm, "aaa", 0); + test_not_found(cm, "zzz", 9); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m(std::sorted_equivalent, {{1, 1}, {2, 2}, {3, 1}, {3, 3}}, c); + assert(!transparent_used); + auto p = m.equal_range(Transparent{3}); + assert(p.first == m.begin() + 2); + assert(p.second == m.end()); + assert(transparent_used); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find.pass.cpp new file mode 100644 index 0000000000000..74b7051eb0d7b --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find.pass.cpp @@ -0,0 +1,57 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator find(const key_type& k); +// const_iterator find(const key_type& k) const; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + M m = {{1, 'a'}, {1, 'a'}, {1, 'b'}, {2, 'c'}, {2, 'b'}, {4, 'a'}, {4, 'd'}, {5, 'e'}, {8, 'a'}, {8, 'h'}}; + ASSERT_SAME_TYPE(decltype(m.find(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).find(0)), typename M::const_iterator); + assert(m.find(0) == m.end()); + assert(m.find(1) == m.begin()); + assert(m.find(2) == m.begin() + 3); + assert(m.find(3) == m.end()); + assert(m.find(4) == m.begin() + 5); + assert(m.find(5) == m.begin() + 7); + assert(m.find(6) == m.end()); + assert(m.find(7) == m.end()); + assert(std::as_const(m).find(8) == m.begin() + 8); + assert(std::as_const(m).find(9) == m.end()); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find_transparent.pass.cpp new file mode 100644 index 0000000000000..be8c6f2e35440 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find_transparent.pass.cpp @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template iterator find(const K& x); +// template const_iterator find(const K& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. 
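+// Hedged note with a small sketch: when several equivalent keys exist, the
+// offsets asserted below assume find(k) lands on the first element of the
+// equal range, i.e. that it behaves like the usual lower_bound idiom:
+//
+//   auto find_like = [](auto& m, const auto& k) {
+//     auto it = m.lower_bound(k);                      // first element not less than k
+//     return (it != m.end() && !m.key_comp()(k, it->first)) ? it : m.end();
+//   };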
+template +concept CanFind = requires(M m, Transparent k) { m.find(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanFind); +static_assert(CanFind); +static_assert(!CanFind); +static_assert(!CanFind); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + M m = {{"alpha", 1}, + {"beta", 2}, + {"beta", 0}, + {"beta", 1}, + {"beta", 2}, + {"epsilon", 3}, + {"epsilon", 1}, + {"eta", 4}, + {"gamma", 6}, + {"gamma", 5}}; + const auto& cm = m; + ASSERT_SAME_TYPE(decltype(m.find(Transparent{"abc"})), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).find(Transparent{"b"})), typename M::const_iterator); + + auto test_find = [&](auto&& map, const std::string& expected_key, long expected_offset) { + auto iter = map.find(Transparent{expected_key}); + assert(iter - map.begin() == expected_offset); + }; + + test_find(m, "alpha", 0); + test_find(m, "beta", 1); + test_find(m, "epsilon", 5); + test_find(m, "eta", 7); + test_find(m, "gamma", 8); + test_find(m, "charlie", 10); + test_find(m, "aaa", 10); + test_find(m, "zzz", 10); + test_find(cm, "alpha", 0); + test_find(cm, "beta", 1); + test_find(cm, "epsilon", 5); + test_find(cm, "eta", 7); + test_find(cm, "gamma", 8); + test_find(cm, "charlie", 10); + test_find(cm, "aaa", 10); + test_find(cm, "zzz", 10); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m(std::sorted_equivalent, {{1, 1}, {2, 2}, {3, 3}, {3, 3}}, c); + assert(!transparent_used); + auto it = m.find(Transparent{3}); + assert(it != m.end()); + assert(transparent_used); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound.pass.cpp new file mode 100644 index 0000000000000..c3befdda7de6e --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound.pass.cpp @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator lower_bound(const key_type& k); +// const_iterator lower_bound(const key_type& k) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = {{1, 'a'}, {2, 'a'}, {2, 'c'}, {2, 'b'}, {4, 'd'}, {5, 'a'}, {5, 'e'}, {8, 'h'}, {8, 'a'}}; + ASSERT_SAME_TYPE(decltype(m.lower_bound(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).lower_bound(0)), typename M::const_iterator); + assert(m.lower_bound(0) == m.begin()); + assert(m.lower_bound(1) == m.begin()); + assert(m.lower_bound(2) == m.begin() + 1); + assert(m.lower_bound(3) == m.begin() + 4); + assert(m.lower_bound(4) == m.begin() + 4); + assert(m.lower_bound(5) == m.begin() + 5); + assert(m.lower_bound(6) == m.begin() + 7); + assert(m.lower_bound(7) == m.begin() + 7); + assert(std::as_const(m).lower_bound(8) == m.begin() + 7); + assert(std::as_const(m).lower_bound(9) == m.end()); + } + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = {{1, 'a'}, {1, 'b'}, {2, 'b'}, {4, 'd'}, {4, 'a'}, {4, 'e'}, {5, 'e'}, {8, 'a'}, {8, 'h'}}; + ASSERT_SAME_TYPE(decltype(m.lower_bound(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).lower_bound(0)), typename M::const_iterator); + assert(m.lower_bound(0) == m.end()); + assert(m.lower_bound(1) == m.begin() + 7); + assert(m.lower_bound(2) == m.begin() + 6); + assert(m.lower_bound(3) == m.begin() + 6); + assert(m.lower_bound(4) == m.begin() + 3); + assert(m.lower_bound(5) == m.begin() + 2); + assert(m.lower_bound(6) == m.begin() + 2); + assert(m.lower_bound(7) == m.begin() + 2); + assert(std::as_const(m).lower_bound(8) == m.begin()); + assert(std::as_const(m).lower_bound(9) == m.begin()); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound_transparent.pass.cpp new file mode 100644 index 0000000000000..b757af132e677 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound_transparent.pass.cpp @@ -0,0 +1,107 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template iterator lower_bound(const K& x); +// template const_iterator lower_bound(const K& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. +template +concept CanLowerBound = requires(M m, Transparent k) { m.lower_bound(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanLowerBound); +static_assert(CanLowerBound); +static_assert(!CanLowerBound); +static_assert(!CanLowerBound); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + M m = {{"alpha", 1}, + {"alpha", 2}, + {"alpha", 3}, + {"beta", 2}, + {"epsilon", 3}, + {"epsilon", 4}, + {"eta", 4}, + {"gamma", 5}, + {"gamma", 5}, + {"gamma", 5}, + {"gamma", 5}}; + const auto& cm = m; + ASSERT_SAME_TYPE(decltype(m.lower_bound(Transparent{"abc"})), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).lower_bound(Transparent{"b"})), typename M::const_iterator); + + auto test_lower_bound = [&](auto&& map, const std::string& expected_key, long expected_offset) { + auto iter = map.lower_bound(Transparent{expected_key}); + assert(iter - map.begin() == expected_offset); + }; + + test_lower_bound(m, "abc", 0); + test_lower_bound(m, "alpha", 0); + test_lower_bound(m, "beta", 3); + test_lower_bound(m, "bets", 4); + test_lower_bound(m, "charlie", 4); + test_lower_bound(m, "echo", 4); + test_lower_bound(m, "epsilon", 4); + test_lower_bound(m, "eta", 6); + test_lower_bound(m, "gamma", 7); + test_lower_bound(m, "golf", 11); + test_lower_bound(m, "zzz", 11); + + test_lower_bound(cm, "abc", 0); + test_lower_bound(cm, "alpha", 0); + test_lower_bound(cm, "beta", 3); + test_lower_bound(cm, "bets", 4); + test_lower_bound(cm, "charlie", 4); + test_lower_bound(cm, "echo", 4); + test_lower_bound(cm, "epsilon", 4); + test_lower_bound(cm, "eta", 6); + test_lower_bound(cm, "gamma", 7); + test_lower_bound(cm, "golf", 11); + test_lower_bound(cm, "zzz", 11); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m(std::sorted_equivalent, {{1, 1}, {2, 2}, {3, 3}}, c); + assert(!transparent_used); + auto it = m.lower_bound(Transparent{3}); + assert(it != m.end()); + assert(transparent_used); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound.pass.cpp new file mode 100644 index 0000000000000..d73d030236e22 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound.pass.cpp @@ -0,0 +1,76 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator upper_bound(const key_type& k); +// const_iterator upper_bound(const key_type& k) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = { + {1, 'a'}, {2, 'b'}, {4, 'd'}, {4, 'e'}, {4, 'a'}, {4, 'b'}, {5, 'e'}, {5, 'a'}, {8, 'a'}, {8, 'b'}, {8, 'h'}}; + ASSERT_SAME_TYPE(decltype(m.upper_bound(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).upper_bound(0)), typename M::const_iterator); + assert(m.upper_bound(0) == m.begin()); + assert(m.upper_bound(1) == m.begin() + 1); + assert(m.upper_bound(2) == m.begin() + 2); + assert(m.upper_bound(3) == m.begin() + 2); + assert(m.upper_bound(4) == m.begin() + 6); + assert(m.upper_bound(5) == m.begin() + 8); + assert(m.upper_bound(6) == m.begin() + 8); + assert(std::as_const(m).upper_bound(7) == m.begin() + 8); + assert(std::as_const(m).upper_bound(8) == m.end()); + assert(std::as_const(m).upper_bound(9) == m.end()); + } + + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = { + {1, 'a'}, {2, 'b'}, {4, 'd'}, {4, 'e'}, {4, 'a'}, {4, 'b'}, {5, 'e'}, {5, 'a'}, {8, 'a'}, {8, 'b'}, {8, 'h'}}; + ASSERT_SAME_TYPE(decltype(m.upper_bound(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).upper_bound(0)), typename M::const_iterator); + assert(m.upper_bound(0) == m.end()); + assert(m.upper_bound(1) == m.end()); + assert(m.upper_bound(2) == m.begin() + 10); + assert(m.upper_bound(3) == m.begin() + 9); + assert(m.upper_bound(4) == m.begin() + 9); + assert(m.upper_bound(5) == m.begin() + 5); + assert(m.upper_bound(6) == m.begin() + 3); + assert(m.upper_bound(7) == m.begin() + 3); + assert(std::as_const(m).upper_bound(8) == m.begin() + 3); + assert(std::as_const(m).upper_bound(9) == m.begin()); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound_transparent.pass.cpp new file mode 100644 index 0000000000000..969489d0fe619 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound_transparent.pass.cpp @@ -0,0 +1,106 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template iterator upper_bound(const K& x); +// template const_iterator upper_bound(const K& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. +template +concept CanUpperBound = requires(M m, Transparent k) { m.upper_bound(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanUpperBound); +static_assert(CanUpperBound); +static_assert(!CanUpperBound); +static_assert(!CanUpperBound); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + M m = {{"alpha", 1}, + {"alpha", 2}, + {"alpha", 3}, + {"beta", 2}, + {"epsilon", 3}, + {"epsilon", 4}, + {"eta", 4}, + {"gamma", 5}, + {"gamma", 5}, + {"gamma", 5}, + {"gamma", 5}}; + const auto& cm = m; + ASSERT_SAME_TYPE(decltype(m.lower_bound(Transparent{"abc"})), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).lower_bound(Transparent{"b"})), typename M::const_iterator); + + auto test_upper_bound = [&](auto&& map, const std::string& expected_key, long expected_offset) { + auto iter = map.upper_bound(Transparent{expected_key}); + assert(iter - map.begin() == expected_offset); + }; + + test_upper_bound(m, "abc", 0); + test_upper_bound(m, "alpha", 3); + test_upper_bound(m, "beta", 4); + test_upper_bound(m, "bets", 4); + test_upper_bound(m, "charlie", 4); + test_upper_bound(m, "echo", 4); + test_upper_bound(m, "epsilon", 6); + test_upper_bound(m, "eta", 7); + test_upper_bound(m, "gamma", 11); + test_upper_bound(m, "golf", 11); + test_upper_bound(m, "zzz", 11); + + test_upper_bound(cm, "abc", 0); + test_upper_bound(cm, "alpha", 3); + test_upper_bound(cm, "beta", 4); + test_upper_bound(cm, "bets", 4); + test_upper_bound(cm, "charlie", 4); + test_upper_bound(cm, "echo", 4); + test_upper_bound(cm, "epsilon", 6); + test_upper_bound(cm, "eta", 7); + test_upper_bound(cm, "gamma", 11); + test_upper_bound(cm, "golf", 11); + test_upper_bound(cm, "zzz", 11); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m(std::sorted_equivalent, {{1, 1}, {2, 2}, {2, 2}, {3, 3}}, c); + assert(!transparent_used); + auto it = m.upper_bound(Transparent{2}); + assert(it == m.begin() + 3); + assert(transparent_used); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h new file mode 100644 index 0000000000000..252e2454d497c --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h @@ -0,0 +1,389 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SUPPORT_FLAT_MULTIMAP_HELPERS_H +#define SUPPORT_FLAT_MULTIMAP_HELPERS_H + +#include +#include +#include +#include +#include + +#include "test_allocator.h" +#include "test_macros.h" + +template +void check_invariant(const std::flat_multimap& m) { + assert(m.keys().size() == m.values().size()); + const auto& keys = m.keys(); + assert(std::is_sorted(keys.begin(), keys.end(), m.key_comp())); +} + +struct StartsWith { + explicit StartsWith(char ch) : lower_(1, ch), upper_(1, ch + 1) {} + StartsWith(const StartsWith&) = delete; + void operator=(const StartsWith&) = delete; + struct Less { + using is_transparent = void; + bool operator()(const std::string& a, const std::string& b) const { return a < b; } + bool operator()(const StartsWith& a, const std::string& b) const { return a.upper_ <= b; } + bool operator()(const std::string& a, const StartsWith& b) const { return a < b.lower_; } + bool operator()(const StartsWith&, const StartsWith&) const { + assert(false); // should not be called + return false; + } + }; + +private: + std::string lower_; + std::string upper_; +}; + +template +struct CopyOnlyVector : std::vector { + using std::vector::vector; + + CopyOnlyVector(const CopyOnlyVector&) = default; + CopyOnlyVector(CopyOnlyVector&& other) : CopyOnlyVector(other) {} + CopyOnlyVector(CopyOnlyVector&& other, std::vector::allocator_type alloc) : CopyOnlyVector(other, alloc) {} + + CopyOnlyVector& operator=(const CopyOnlyVector&) = default; + CopyOnlyVector& operator=(CopyOnlyVector& other) { return this->operator=(other); } +}; + +template +struct Transparent { + T t; + + operator T() const + requires ConvertibleToT + { + return t; + } +}; + +template +using ConvertibleTransparent = Transparent; + +template +using NonConvertibleTransparent = Transparent; + +struct TransparentComparator { + using is_transparent = void; + + bool* transparent_used = nullptr; + TransparentComparator() = default; + TransparentComparator(bool& used) : transparent_used(&used) {} + + template + bool operator()(const T& t, const Transparent& transparent) const { + if (transparent_used != nullptr) { + *transparent_used = true; + } + return t < transparent.t; + } + + template + bool operator()(const Transparent& transparent, const T& t) const { + if (transparent_used != nullptr) { + *transparent_used = true; + } + return transparent.t < t; + } + + template + bool operator()(const T& t1, const T& t2) const { + return t1 < t2; + } +}; + +struct NonTransparentComparator { + template + bool operator()(const T&, const Transparent&) const; + + template + bool operator()(const Transparent&, const T&) const; + + template + bool operator()(const T&, const T&) const; +}; + +struct NoDefaultCtr { + NoDefaultCtr() = delete; +}; + +#ifndef TEST_HAS_NO_EXCEPTIONS +template +struct EmplaceUnsafeContainer : std::vector { + using std::vector::vector; + + template + auto emplace(Args&&... args) -> decltype(std::declval>().emplace(std::forward(args)...)) { + if (this->size() > 1) { + auto it1 = this->begin(); + auto it2 = it1 + 1; + // messing up the container + std::iter_swap(it1, it2); + } + + throw 42; + } + + template + auto insert(Args&&... 
args) -> decltype(std::declval>().insert(std::forward(args)...)) { + if (this->size() > 1) { + auto it1 = this->begin(); + auto it2 = it1 + 1; + // messing up the container + std::iter_swap(it1, it2); + } + + throw 42; + } +}; + +template +struct ThrowOnEraseContainer : std::vector { + using std::vector::vector; + + template + auto erase(Args&&... args) -> decltype(std::declval>().erase(std::forward(args)...)) { + throw 42; + } +}; + +template +struct ThrowOnMoveContainer : std::vector { + using std::vector::vector; + + ThrowOnMoveContainer(ThrowOnMoveContainer&&) { throw 42; } + + ThrowOnMoveContainer& operator=(ThrowOnMoveContainer&&) { throw 42; } +}; + +#endif + +template +void test_emplace_exception_guarantee([[maybe_unused]] F&& emplace_function) { +#ifndef TEST_HAS_NO_EXCEPTIONS + using C = TransparentComparator; + { + // Throw on emplace the key, and underlying has strong exception guarantee + using KeyContainer = std::vector>; + using M = std::flat_multimap; + + LIBCPP_STATIC_ASSERT(std::__container_traits::__emplacement_has_strong_exception_safety_guarantee); + + test_allocator_statistics stats; + + KeyContainer a({1, 1, 2, 4}, test_allocator{&stats}); + std::vector b = {5, 6, 7, 8}; + [[maybe_unused]] auto expected_keys = a; + [[maybe_unused]] auto expected_values = b; + M m(std::sorted_equivalent, std::move(a), std::move(b)); + + stats.throw_after = 1; + try { + emplace_function(m, 1, 1); + assert(false); + } catch (const std::bad_alloc&) { + check_invariant(m); + // In libc++, the flat_multimap is unchanged + LIBCPP_ASSERT(m.size() == 4); + LIBCPP_ASSERT(m.keys() == expected_keys); + LIBCPP_ASSERT(m.values() == expected_values); + } + } + { + // Throw on emplace the key, and underlying has no strong exception guarantee + using KeyContainer = EmplaceUnsafeContainer; + using M = std::flat_multimap; + + LIBCPP_STATIC_ASSERT(!std::__container_traits::__emplacement_has_strong_exception_safety_guarantee); + KeyContainer a = {1, 2, 2, 4}; + std::vector b = {5, 6, 7, 8}; + M m(std::sorted_equivalent, std::move(a), std::move(b)); + try { + emplace_function(m, 1, 1); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, the flat_multimap is cleared + LIBCPP_ASSERT(m.size() == 0); + } + } + { + // Throw on emplace the value, and underlying has strong exception guarantee + using ValueContainer = std::vector>; + ; + using M = std::flat_multimap, ValueContainer>; + + LIBCPP_STATIC_ASSERT(std::__container_traits::__emplacement_has_strong_exception_safety_guarantee); + + std::vector a = {1, 3, 3, 4}; + test_allocator_statistics stats; + ValueContainer b({1, 2, 3, 4}, test_allocator{&stats}); + + [[maybe_unused]] auto expected_keys = a; + [[maybe_unused]] auto expected_values = b; + M m(std::sorted_equivalent, std::move(a), std::move(b)); + + stats.throw_after = 1; + try { + emplace_function(m, 3, 3); + assert(false); + } catch (const std::bad_alloc&) { + check_invariant(m); + // In libc++, the emplaced key is erased and the flat_multimap is unchanged + LIBCPP_ASSERT(m.size() == 4); + LIBCPP_ASSERT(m.keys() == expected_keys); + LIBCPP_ASSERT(m.values() == expected_values); + } + } + { + // Throw on emplace the value, and underlying has no strong exception guarantee + using ValueContainer = EmplaceUnsafeContainer; + using M = std::flat_multimap, ValueContainer>; + + LIBCPP_STATIC_ASSERT(!std::__container_traits::__emplacement_has_strong_exception_safety_guarantee); + std::vector a = {1, 1, 1, 1}; + ValueContainer b = {1, 2, 3, 4}; + + M m(std::sorted_equivalent, std::move(a), 
std::move(b)); + + try { + emplace_function(m, 1, 5); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, the flat_multimap is cleared + LIBCPP_ASSERT(m.size() == 0); + } + } + { + // Throw on emplace the value, then throw again on erasing the key + using KeyContainer = ThrowOnEraseContainer; + using ValueContainer = std::vector>; + using M = std::flat_multimap; + + LIBCPP_STATIC_ASSERT(std::__container_traits::__emplacement_has_strong_exception_safety_guarantee); + + KeyContainer a = {4, 4, 4, 4}; + test_allocator_statistics stats; + ValueContainer b({1, 2, 3, 4}, test_allocator{&stats}); + + M m(std::sorted_equivalent, std::move(a), std::move(b)); + stats.throw_after = 1; + try { + emplace_function(m, 0, 0); + assert(false); + } catch (const std::bad_alloc&) { + check_invariant(m); + // In libc++, we try to erase the key after value emplacement failure. + // and after erasure failure, we clear the flat_multimap + LIBCPP_ASSERT(m.size() == 0); + } + } +#endif +} + +template +void test_insert_range_exception_guarantee([[maybe_unused]] F&& insert_function) { +#ifndef TEST_HAS_NO_EXCEPTIONS + using KeyContainer = EmplaceUnsafeContainer; + using ValueContainer = std::vector; + using M = std::flat_multimap; + test_allocator_statistics stats; + KeyContainer a{1, 2, 3, 4}; + ValueContainer b{1, 2, 3, 4}; + M m(std::sorted_equivalent, std::move(a), std::move(b)); + + std::vector> newValues = {{0, 0}, {1, 1}, {5, 5}, {6, 6}, {7, 7}, {8, 8}}; + stats.throw_after = 1; + try { + insert_function(m, newValues); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, we clear if anything goes wrong when inserting a range + LIBCPP_ASSERT(m.size() == 0); + } +#endif +} + +template +void test_erase_exception_guarantee([[maybe_unused]] F&& erase_function) { +#ifndef TEST_HAS_NO_EXCEPTIONS + { + // key erase throws + using KeyContainer = ThrowOnEraseContainer; + using ValueContainer = std::vector; + using M = std::flat_multimap; + + KeyContainer a{1, 3, 3, 4}; + ValueContainer b{1, 3, 3, 4}; + M m(std::sorted_equivalent, std::move(a), std::move(b)); + try { + erase_function(m, 3); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, we clear if anything goes wrong when erasing + LIBCPP_ASSERT(m.size() == 0); + } + } + { + // key erase throws + using KeyContainer = std::vector; + using ValueContainer = ThrowOnEraseContainer; + using M = std::flat_multimap; + + KeyContainer a{1, 3, 3, 4}; + ValueContainer b{1, 3, 3, 4}; + M m(std::sorted_equivalent, std::move(a), std::move(b)); + try { + erase_function(m, 3); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, we clear if anything goes wrong when erasing + LIBCPP_ASSERT(m.size() == 0); + } + } +#endif +} +class Moveable { + int int_; + double double_; + +public: + Moveable() : int_(0), double_(0) {} + Moveable(int i, double d) : int_(i), double_(d) {} + Moveable(Moveable&& x) : int_(x.int_), double_(x.double_) { + x.int_ = -1; + x.double_ = -1; + } + Moveable& operator=(Moveable&& x) { + int_ = x.int_; + x.int_ = -1; + double_ = x.double_; + x.double_ = -1; + return *this; + } + + Moveable(const Moveable&) = delete; + Moveable& operator=(const Moveable&) = delete; + bool operator==(const Moveable& x) const { return int_ == x.int_ && double_ == x.double_; } + bool operator<(const Moveable& x) const { return int_ < x.int_ || (int_ == x.int_ && double_ < x.double_); } + + int get() const { return int_; } + bool moved() const { return int_ == -1; } +}; + +#endif // 
SUPPORT_FLAT_MULTIMAP_HELPERS_H
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/incomplete_type.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/incomplete_type.pass.cpp
new file mode 100644
index 0000000000000..e4325b1dfe3ba
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/incomplete_type.pass.cpp
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// Check that std::flat_multimap and its iterators can be instantiated with an incomplete
+// type.
+
+#include <flat_map>
+#include <vector>
+
+struct A {
+  using Map = std::flat_multimap<A, A>;
+  int data;
+  Map m;
+  Map::iterator it;
+  Map::const_iterator cit;
+};
+
+// Implement the operator< required in order to instantiate flat_multimap<A, A>
+bool operator<(A const& L, A const& R) { return L.data < R.data; }
+
+int main(int, char**) {
+  A a;
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/op_compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/op_compare.pass.cpp
new file mode 100644
index 0000000000000..680ff1a127dda
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/op_compare.pass.cpp
@@ -0,0 +1,133 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// friend bool operator==(const flat_multimap& x, const flat_multimap& y); +// friend synth-three-way-result +// operator<=>(const flat_multimap& x, const flat_multimap& y); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_comparisons.h" +#include "test_container_comparisons.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + + { + using C = std::flat_multimap; + C s1 = {{1, 1}}; + C s2 = {{2, 0}}; // {{1,1}} versus {{2,0}} + ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::strong_ordering); + AssertComparisonsReturnBool(); + assert(testComparisons(s1, s2, false, true)); + s2 = {{1, 1}}; // {{1,1}} versus {{1,1}} + assert(testComparisons(s1, s2, true, false)); + s2 = {{1, 1}, {2, 0}}; // {{1,1}} versus {{1,1},{2,0}} + assert(testComparisons(s1, s2, false, true)); + s1 = {{0, 0}, {1, 1}, {2, 2}}; // {{0,0},{1,1},{2,2}} versus {{1,1},{2,0}} + assert(testComparisons(s1, s2, false, true)); + s2 = {{0, 0}, {1, 1}, {2, 3}}; // {{0,0},{1,1},{2,2}} versus {{0,0},{1,1},{2,3}} + assert(testComparisons(s1, s2, false, true)); + + s1 = {{1, 1}, {1, 1}}; + s2 = {{1, 1}, {1, 1}}; + assert(testComparisons(s1, s2, true, false)); + + s2 = {{1, 1}, {1, 1}, {2, 2}}; + assert(testComparisons(s1, s2, false, true)); + + s2 = {{1, 1}, {2, 2}, {2, 2}}; + assert(testComparisons(s1, s2, false, true)); + + s2 = {{0, 0}, {1, 1}, {1, 1}}; + assert(testComparisons(s1, s2, false, false)); + } + { + // Comparisons use value_type's native operators, not the comparator + using C = std::flat_multimap>; + C s1 = {{1, 1}}; + C s2 = {{2, 0}}; // {{1,1}} versus {{2,0}} + ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::strong_ordering); + AssertComparisonsReturnBool(); + assert(testComparisons(s1, s2, false, true)); + s2 = {{1, 1}}; // {{1,1}} versus {{1,1}} + assert(testComparisons(s1, s2, true, false)); + s2 = {{1, 1}, {2, 0}}; // {{1,1}} versus {{2,0},{1,1}} + assert(testComparisons(s1, s2, false, true)); + s1 = {{0, 0}, {1, 1}, {2, 2}}; // {{2,2},{1,1},{0,0}} versus {2,0},{1,1}} + assert(testComparisons(s1, s2, false, false)); + s2 = {{0, 0}, {1, 1}, {2, 3}}; // {{2,2},{1,1},{0,0}} versus {{2,3},{1,1},{0,0}} + assert(testComparisons(s1, s2, false, true)); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::deque>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + test>, std::vector>>(); + + { + using C = std::flat_multimap; + C s1 = {{1, 1}}; + C s2 = C(std::sorted_equivalent, {{std::numeric_limits::quiet_NaN(), 2}}); + ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::partial_ordering); + AssertComparisonsReturnBool(); + assert(testComparisonsComplete(s1, s2, false, false, false)); + } + { + using C = std::flat_multimap; + C s1 = {{1, 1}}; + C s2 = C(std::sorted_equivalent, {{2, std::numeric_limits::quiet_NaN()}}); + ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::partial_ordering); + AssertComparisonsReturnBool(); + assert(testComparisonsComplete(s1, s2, false, true, false)); + s2 = C(std::sorted_equivalent, {{1, std::numeric_limits::quiet_NaN()}}); + assert(testComparisonsComplete(s1, s2, false, false, false)); + } + { + // Comparisons 
use value_type's native operators, not the comparator + struct StrongComp { + bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; } + }; + using C = std::flat_multimap; + C s1 = {{1, 1}}; + C s2 = {{std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()}}; + ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::partial_ordering); + AssertComparisonsReturnBool(); + assert(testComparisonsComplete(s1, s2, false, false, false)); + s1 = {{{1, 1}, {std::numeric_limits::quiet_NaN(), 1}}}; + s2 = {{{std::numeric_limits::quiet_NaN(), 1}, {1, 1}}}; + assert(std::lexicographical_compare_three_way( + s1.keys().begin(), s1.keys().end(), s2.keys().begin(), s2.keys().end(), std::strong_order) == + std::strong_ordering::equal); + assert(s1 != s2); + assert((s1 <=> s2) == std::partial_ordering::unordered); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/types.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/types.compile.pass.cpp new file mode 100644 index 0000000000000..490d51c299793 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/types.compile.pass.cpp @@ -0,0 +1,133 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// using key_type = Key; +// using mapped_type = T; +// using value_type = pair; +// using key_compare = Compare; +// using reference = pair; +// using const_reference = pair; +// using size_type = size_t; +// using difference_type = ptrdiff_t; +// using iterator = implementation-defined; // see [container.requirements] +// using const_iterator = implementation-defined; // see [container.requirements] +// using reverse_iterator = std::reverse_iterator; +// using const_reverse_iterator = std::reverse_iterator; +// using key_container_type = KeyContainer; +// using mapped_container_type = MappedContainer; + +// class value_compare; + +// struct containers { +// key_container_type keys; +// mapped_container_type values; +// }; + +#include +#include +#include +#include +#include +#include +#include +#include "min_allocator.h" + +void test() { + { + using M = std::flat_multimap; + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(requires { typename M::iterator; }); + static_assert(requires { typename M::const_iterator; }); + static_assert(std::is_same_v>); + static_assert( + std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(requires { typename M::value_compare; }); + static_assert(requires { typename M::containers; }); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + } + + { + struct A {}; + struct Compare { + bool operator()(const std::string&, const std::string&) const; + }; + using M = std::flat_multimap, std::deque>; + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + 
static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(requires { typename M::iterator; }); + static_assert(requires { typename M::const_iterator; }); + static_assert(std::is_same_v>); + static_assert( + std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(requires { typename M::value_compare; }); + static_assert(requires { typename M::containers; }); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + } + { + using C = std::flat_multimap; + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(!std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + } + { + using C = std::flat_multimap, std::deque>>; + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(!std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + // size_type is invariably size_t + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>>); + static_assert(std::is_same_v>); + } +} diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp new file mode 100644 index 0000000000000..0add849312d5e --- /dev/null +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// WARNING: This test was generated by generate_feature_test_macro_components.py +// and should not be edited manually. 
+// +// clang-format off + +// + +// Test the feature test macros defined by + +/* Constant Value + __cpp_lib_flat_map 202207L [C++23] +*/ + +#include +#include "test_macros.h" + +#if TEST_STD_VER < 14 + +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +#elif TEST_STD_VER == 14 + +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +#elif TEST_STD_VER == 17 + +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +#elif TEST_STD_VER == 20 + +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +#elif TEST_STD_VER == 23 + +# ifndef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should be defined in c++23" +# endif +# if __cpp_lib_flat_map != 202207L +# error "__cpp_lib_flat_map should have the value 202207L in c++23" +# endif + +#elif TEST_STD_VER > 23 + +# ifndef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should be defined in c++26" +# endif +# if __cpp_lib_flat_map != 202207L +# error "__cpp_lib_flat_map should have the value 202207L in c++26" +# endif + +#endif // TEST_STD_VER > 23 + diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 605788f559d3c..8f5788d2bed20 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -88,6 +88,8 @@ 201902L [C++20] __cpp_lib_expected 202211L [C++23] __cpp_lib_filesystem 201703L [C++17] + __cpp_lib_flat_map 202207L [C++23] + __cpp_lib_flat_set 202207L [C++23] __cpp_lib_format 202110L [C++20] __cpp_lib_format_path 202403L [C++26] __cpp_lib_format_ranges 202207L [C++23] @@ -528,6 +530,14 @@ # error "__cpp_lib_filesystem should not be defined before c++17" # endif +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +# ifdef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should not be defined before c++23" +# endif + # ifdef __cpp_lib_format # error "__cpp_lib_format should not be defined before c++20" # endif @@ -1399,6 +1409,14 @@ # error "__cpp_lib_filesystem should not be defined before c++17" # endif +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +# ifdef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should not be defined before c++23" +# endif + # ifdef __cpp_lib_format # error "__cpp_lib_format should not be defined before c++20" # endif @@ -2390,6 +2408,14 @@ # endif # endif +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +# ifdef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should not be defined before c++23" +# endif + # ifdef __cpp_lib_format # error "__cpp_lib_format should not be defined before c++20" # endif @@ -3651,6 +3677,14 @@ # endif # endif +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +# ifdef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should not be defined before c++23" +# endif + # ifndef __cpp_lib_format # error "__cpp_lib_format should be defined in c++20" # endif @@ -5092,6 +5126,26 @@ # endif # endif +# ifndef __cpp_lib_flat_map +# error 
"__cpp_lib_flat_map should be defined in c++23" +# endif +# if __cpp_lib_flat_map != 202207L +# error "__cpp_lib_flat_map should have the value 202207L in c++23" +# endif + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should be defined in c++23" +# endif +# if __cpp_lib_flat_set != 202207L +# error "__cpp_lib_flat_set should have the value 202207L in c++23" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should not be defined because it is unimplemented in libc++!" +# endif +# endif + # ifndef __cpp_lib_format # error "__cpp_lib_format should be defined in c++23" # endif @@ -6779,6 +6833,26 @@ # endif # endif +# ifndef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should be defined in c++26" +# endif +# if __cpp_lib_flat_map != 202207L +# error "__cpp_lib_flat_map should have the value 202207L in c++26" +# endif + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should be defined in c++26" +# endif +# if __cpp_lib_flat_set != 202207L +# error "__cpp_lib_flat_set should have the value 202207L in c++26" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should not be defined because it is unimplemented in libc++!" +# endif +# endif + # ifndef __cpp_lib_format # error "__cpp_lib_format should be defined in c++26" # endif diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index c4065cdc1afef..58ecd79cf7469 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -505,6 +505,17 @@ def add_version_header(tc): "test_suite_guard": "!defined(_LIBCPP_VERSION) || (_LIBCPP_HAS_FILESYSTEM && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)", "libcxx_guard": "_LIBCPP_HAS_FILESYSTEM && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY", }, + { + "name": "__cpp_lib_flat_map", + "values": {"c++23": 202207}, + "headers": ["flat_map"], + }, + { + "name": "__cpp_lib_flat_set", + "values": {"c++23": 202207}, + "headers": ["flat_set"], + "unimplemented": True, + }, { "name": "__cpp_lib_format", "values": { From d578d0bb135ca337b14aabe6696fe5b0a0932932 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Sat, 25 Jan 2025 18:30:36 +0000 Subject: [PATCH 100/432] [gn build] Port def50f701f6a --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 90303821eb09f..f118d22c472d8 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1325,8 +1325,11 @@ if (current_toolchain == default_toolchain) { "__filesystem/space_info.h", "__filesystem/u8path.h", "__flat_map/flat_map.h", + "__flat_map/flat_multimap.h", "__flat_map/key_value_iterator.h", + "__flat_map/sorted_equivalent.h", "__flat_map/sorted_unique.h", + "__flat_map/utils.h", "__format/buffer.h", "__format/concepts.h", "__format/container_adaptor.h", From 2655ae54db6d7e9276a5ef4208cbeff1ae2ee72c Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Sat, 25 Jan 2025 13:52:07 -0500 Subject: [PATCH 101/432] [mlir] Fix deprecated pointer union casts in toy example (#124422) --- mlir/examples/toy/Ch4/mlir/Dialect.cpp | 2 +- mlir/examples/toy/Ch5/mlir/Dialect.cpp | 2 +- mlir/examples/toy/Ch6/mlir/Dialect.cpp | 2 +- mlir/examples/toy/Ch7/mlir/Dialect.cpp 
 | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/examples/toy/Ch4/mlir/Dialect.cpp b/mlir/examples/toy/Ch4/mlir/Dialect.cpp
index 6c6cdd934cea8..076a75a26619b 100644
--- a/mlir/examples/toy/Ch4/mlir/Dialect.cpp
+++ b/mlir/examples/toy/Ch4/mlir/Dialect.cpp
@@ -333,7 +333,7 @@ CallInterfaceCallable GenericCallOp::getCallableForCallee() {
 /// Set the callee for the generic call operation, this is required by the call
 /// interface.
 void GenericCallOp::setCalleeFromCallable(CallInterfaceCallable callee) {
-  (*this)->setAttr("callee", callee.get<SymbolRefAttr>());
+  (*this)->setAttr("callee", cast<SymbolRefAttr>(callee));
 }
 
 /// Get the argument operands to the called function, this is required by the
diff --git a/mlir/examples/toy/Ch5/mlir/Dialect.cpp b/mlir/examples/toy/Ch5/mlir/Dialect.cpp
index 72072f9188bf3..fb7c742a01802 100644
--- a/mlir/examples/toy/Ch5/mlir/Dialect.cpp
+++ b/mlir/examples/toy/Ch5/mlir/Dialect.cpp
@@ -333,7 +333,7 @@ CallInterfaceCallable GenericCallOp::getCallableForCallee() {
 /// Set the callee for the generic call operation, this is required by the call
 /// interface.
 void GenericCallOp::setCalleeFromCallable(CallInterfaceCallable callee) {
-  (*this)->setAttr("callee", callee.get<SymbolRefAttr>());
+  (*this)->setAttr("callee", cast<SymbolRefAttr>(callee));
 }
 
 /// Get the argument operands to the called function, this is required by the
diff --git a/mlir/examples/toy/Ch6/mlir/Dialect.cpp b/mlir/examples/toy/Ch6/mlir/Dialect.cpp
index 72072f9188bf3..fb7c742a01802 100644
--- a/mlir/examples/toy/Ch6/mlir/Dialect.cpp
+++ b/mlir/examples/toy/Ch6/mlir/Dialect.cpp
@@ -333,7 +333,7 @@ CallInterfaceCallable GenericCallOp::getCallableForCallee() {
 /// Set the callee for the generic call operation, this is required by the call
 /// interface.
 void GenericCallOp::setCalleeFromCallable(CallInterfaceCallable callee) {
-  (*this)->setAttr("callee", callee.get<SymbolRefAttr>());
+  (*this)->setAttr("callee", cast<SymbolRefAttr>(callee));
 }
 
 /// Get the argument operands to the called function, this is required by the
diff --git a/mlir/examples/toy/Ch7/mlir/Dialect.cpp b/mlir/examples/toy/Ch7/mlir/Dialect.cpp
index 7e030ffc5488c..55c44c45e0f00 100644
--- a/mlir/examples/toy/Ch7/mlir/Dialect.cpp
+++ b/mlir/examples/toy/Ch7/mlir/Dialect.cpp
@@ -367,7 +367,7 @@ CallInterfaceCallable GenericCallOp::getCallableForCallee() {
 /// Set the callee for the generic call operation, this is required by the call
 /// interface.
 void GenericCallOp::setCalleeFromCallable(CallInterfaceCallable callee) {
-  (*this)->setAttr("callee", callee.get<SymbolRefAttr>());
+  (*this)->setAttr("callee", cast<SymbolRefAttr>(callee));
 }
 
 /// Get the argument operands to the called function, this is required by the
From 4bcd8184a093d2d9f0aad1053dbb1367891da6a5 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sat, 25 Jan 2025 10:53:01 -0800
Subject: [PATCH 102/432] [TargetLowering] Pull similar code out of the
 forceExpandWideMUL into a helper. NFC (#124371)

These functions have similar code. One of them calculates the 2x width
full product from 2 sources. The other calculates the product from 2
sources that have low and high halves.

This patch introduces a new function that takes HiLHS and HiRHS as
optional values. If they are not null, they will be used in the
calculation of the Hi half. The Signed flag can only be set when
HiLHS/HiRHS are null.
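As a reader's aid (not part of the patch), the 4-piece expansion can be
sketched over plain 64-bit integers; mulhilo64 is a hypothetical name, and
the structure mirrors the DAG nodes the new helper emits. The signed variant
would extract the high halves with arithmetic right shifts instead.

  #include <cstdint>
  #include <utility>

  // Full 128-bit product of two uint64_t values as a (Hi, Lo) pair, using
  // only 64-bit multiplies (Knuth's Algorithm M / Hacker's Delight scheme).
  std::pair<uint64_t, uint64_t> mulhilo64(uint64_t LHS, uint64_t RHS) {
    const uint64_t Mask = 0xFFFFFFFFu;        // low half-word mask
    uint64_t LL = LHS & Mask, LH = LHS >> 32; // halves of LHS
    uint64_t RL = RHS & Mask, RH = RHS >> 32; // halves of RHS

    uint64_t T = LL * RL;                     // low x low
    uint64_t TL = T & Mask, TH = T >> 32;
    uint64_t U = LH * RL + TH;                // first cross product + carry
    uint64_t UL = U & Mask, UH = U >> 32;
    uint64_t V = LL * RH + UL;                // second cross product + carry
    uint64_t VH = V >> 32;

    uint64_t Lo = TL + (V << 32);             // assemble the low word
    uint64_t Hi = LH * RH + UH + VH;          // assemble the high word
    return {Hi, Lo};
  }

When HiLHS/HiRHS are provided, the helper additionally folds
HiRHS*LHS + RHS*HiLHS into Hi, which is exactly the wrap-around term of a
double-wide multiply.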
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |   9 +
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 186 ++++++++++--------
 2 files changed, 108 insertions(+), 87 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 861cffdc115a4..4ad2835a70404 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5503,6 +5503,15 @@ class TargetLowering : public TargetLoweringBase {
   bool expandMULO(SDNode *Node, SDValue &Result, SDValue &Overflow,
                   SelectionDAG &DAG) const;
 
+  /// Calculate the product twice the width of LHS and RHS. If HiLHS/HiRHS are
+  /// non-null they will be included in the multiplication. The expansion works
+  /// by splitting the 2 inputs into 4 pieces that we can multiply and add
+  /// together without needing MULH or MUL_LOHI.
+  void forceExpandMultiply(SelectionDAG &DAG, const SDLoc &dl, bool Signed,
+                           SDValue &Lo, SDValue &Hi, SDValue LHS, SDValue RHS,
+                           SDValue HiLHS = SDValue(),
+                           SDValue HiRHS = SDValue()) const;
+
   /// forceExpandWideMUL - Unconditionally expand a MUL into either a libcall or
   /// brute force via a wide multiplication. The expansion works by
   /// attempting to do a multiplication on a wider type twice the size of the
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 0d039860b9f0f..a37ec662ce2d9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10857,6 +10857,64 @@ SDValue TargetLowering::expandShlSat(SDNode *Node, SelectionDAG &DAG) const {
   return DAG.getSelect(dl, VT, Cond, SatVal, Result);
 }
 
+void TargetLowering::forceExpandMultiply(SelectionDAG &DAG, const SDLoc &dl,
+                                         bool Signed, SDValue &Lo, SDValue &Hi,
+                                         SDValue LHS, SDValue RHS,
+                                         SDValue HiLHS, SDValue HiRHS) const {
+  EVT VT = LHS.getValueType();
+  assert(RHS.getValueType() == VT && "Mismatching operand types");
+
+  assert((HiLHS && HiRHS) || (!HiLHS && !HiRHS));
+  assert((!Signed || !HiLHS) &&
+         "Signed flag should only be set when HiLHS and HiRHS are null");
+
+  // We'll expand the multiplication by brute force because we have no other
+  // options. This is a trivially-generalized version of the code from
+  // Hacker's Delight (itself derived from Knuth's Algorithm M from section
+  // 4.3.1). If Signed is set, we can use arithmetic right shifts to propagate
+  // sign bits while calculating the Hi half.
+  unsigned Bits = VT.getSizeInBits();
+  unsigned HalfBits = Bits / 2;
+  SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT);
+  SDValue LL = DAG.getNode(ISD::AND, dl, VT, LHS, Mask);
+  SDValue RL = DAG.getNode(ISD::AND, dl, VT, RHS, Mask);
+
+  SDValue T = DAG.getNode(ISD::MUL, dl, VT, LL, RL);
+  SDValue TL = DAG.getNode(ISD::AND, dl, VT, T, Mask);
+
+  SDValue Shift = DAG.getShiftAmountConstant(HalfBits, VT, dl);
+  // This is always an unsigned shift.
+  SDValue TH = DAG.getNode(ISD::SRL, dl, VT, T, Shift);
+
+  unsigned ShiftOpc = Signed ?
ISD::SRA : ISD::SRL; + SDValue LH = DAG.getNode(ShiftOpc, dl, VT, LHS, Shift); + SDValue RH = DAG.getNode(ShiftOpc, dl, VT, RHS, Shift); + + SDValue U = + DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RL), TH); + SDValue UL = DAG.getNode(ISD::AND, dl, VT, U, Mask); + SDValue UH = DAG.getNode(ShiftOpc, dl, VT, U, Shift); + + SDValue V = + DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LL, RH), UL); + SDValue VH = DAG.getNode(ShiftOpc, dl, VT, V, Shift); + + Lo = DAG.getNode(ISD::ADD, dl, VT, TL, + DAG.getNode(ISD::SHL, dl, VT, V, Shift)); + + Hi = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RH), + DAG.getNode(ISD::ADD, dl, VT, UH, VH)); + + // If HiLHS and HiRHS are set, multiply them by the opposite low part and add + // the products to Hi. + if (HiLHS) { + Hi = DAG.getNode(ISD::ADD, dl, VT, Hi, + DAG.getNode(ISD::ADD, dl, VT, + DAG.getNode(ISD::MUL, dl, VT, HiRHS, LHS), + DAG.getNode(ISD::MUL, dl, VT, RHS, HiLHS))); + } +} + void TargetLowering::forceExpandWideMUL(SelectionDAG &DAG, const SDLoc &dl, bool Signed, EVT WideVT, const SDValue LL, const SDValue LH, @@ -10877,45 +10935,7 @@ void TargetLowering::forceExpandWideMUL(SelectionDAG &DAG, const SDLoc &dl, LC = RTLIB::MUL_I128; if (LC == RTLIB::UNKNOWN_LIBCALL || !getLibcallName(LC)) { - // We'll expand the multiplication by brute force because we have no other - // options. This is a trivially-generalized version of the code from - // Hacker's Delight (itself derived from Knuth's Algorithm M from section - // 4.3.1). - EVT VT = LL.getValueType(); - unsigned Bits = VT.getSizeInBits(); - unsigned HalfBits = Bits >> 1; - SDValue Mask = - DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT); - SDValue LLL = DAG.getNode(ISD::AND, dl, VT, LL, Mask); - SDValue RLL = DAG.getNode(ISD::AND, dl, VT, RL, Mask); - - SDValue T = DAG.getNode(ISD::MUL, dl, VT, LLL, RLL); - SDValue TL = DAG.getNode(ISD::AND, dl, VT, T, Mask); - - SDValue Shift = DAG.getShiftAmountConstant(HalfBits, VT, dl); - SDValue TH = DAG.getNode(ISD::SRL, dl, VT, T, Shift); - SDValue LLH = DAG.getNode(ISD::SRL, dl, VT, LL, Shift); - SDValue RLH = DAG.getNode(ISD::SRL, dl, VT, RL, Shift); - - SDValue U = DAG.getNode(ISD::ADD, dl, VT, - DAG.getNode(ISD::MUL, dl, VT, LLH, RLL), TH); - SDValue UL = DAG.getNode(ISD::AND, dl, VT, U, Mask); - SDValue UH = DAG.getNode(ISD::SRL, dl, VT, U, Shift); - - SDValue V = DAG.getNode(ISD::ADD, dl, VT, - DAG.getNode(ISD::MUL, dl, VT, LLL, RLH), UL); - SDValue VH = DAG.getNode(ISD::SRL, dl, VT, V, Shift); - - SDValue W = - DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LLH, RLH), - DAG.getNode(ISD::ADD, dl, VT, UH, VH)); - Lo = DAG.getNode(ISD::ADD, dl, VT, TL, - DAG.getNode(ISD::SHL, dl, VT, V, Shift)); - - Hi = DAG.getNode(ISD::ADD, dl, VT, W, - DAG.getNode(ISD::ADD, dl, VT, - DAG.getNode(ISD::MUL, dl, VT, RH, LL), - DAG.getNode(ISD::MUL, dl, VT, RL, LH))); + forceExpandMultiply(DAG, dl, /*Signed=*/false, Lo, Hi, LL, RL, LH, RH); } else { // Attempt a libcall. SDValue Ret; @@ -10965,58 +10985,50 @@ void TargetLowering::forceExpandWideMUL(SelectionDAG &DAG, const SDLoc &dl, else if (WideVT == MVT::i128) LC = RTLIB::MUL_I128; - if (LC != RTLIB::UNKNOWN_LIBCALL && getLibcallName(LC)) { - SDValue HiLHS, HiRHS; - if (Signed) { - // The high part is obtained by SRA'ing all but one of the bits of low - // part. 
- unsigned LoSize = VT.getFixedSizeInBits(); - SDValue Shift = DAG.getShiftAmountConstant(LoSize - 1, VT, dl); - HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, Shift); - HiRHS = DAG.getNode(ISD::SRA, dl, VT, RHS, Shift); - } else { - HiLHS = DAG.getConstant(0, dl, VT); - HiRHS = DAG.getConstant(0, dl, VT); - } - forceExpandWideMUL(DAG, dl, Signed, WideVT, LHS, HiLHS, RHS, HiRHS, Lo, Hi); + if (LC == RTLIB::UNKNOWN_LIBCALL || !getLibcallName(LC)) { + forceExpandMultiply(DAG, dl, Signed, Lo, Hi, LHS, RHS); return; } - // Expand the multiplication by brute force. This is a generalized-version of - // the code from Hacker's Delight (itself derived from Knuth's Algorithm M - // from section 4.3.1) combined with the Hacker's delight code - // for calculating mulhs. - unsigned Bits = VT.getSizeInBits(); - unsigned HalfBits = Bits / 2; - SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT); - SDValue LL = DAG.getNode(ISD::AND, dl, VT, LHS, Mask); - SDValue RL = DAG.getNode(ISD::AND, dl, VT, RHS, Mask); - - SDValue T = DAG.getNode(ISD::MUL, dl, VT, LL, RL); - SDValue TL = DAG.getNode(ISD::AND, dl, VT, T, Mask); - - SDValue Shift = DAG.getShiftAmountConstant(HalfBits, VT, dl); - // This is always an unsigned shift. - SDValue TH = DAG.getNode(ISD::SRL, dl, VT, T, Shift); - - unsigned ShiftOpc = Signed ? ISD::SRA : ISD::SRL; - SDValue LH = DAG.getNode(ShiftOpc, dl, VT, LHS, Shift); - SDValue RH = DAG.getNode(ShiftOpc, dl, VT, RHS, Shift); - - SDValue U = - DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RL), TH); - SDValue UL = DAG.getNode(ISD::AND, dl, VT, U, Mask); - SDValue UH = DAG.getNode(ShiftOpc, dl, VT, U, Shift); - - SDValue V = - DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LL, RH), UL); - SDValue VH = DAG.getNode(ShiftOpc, dl, VT, V, Shift); - - Lo = DAG.getNode(ISD::ADD, dl, VT, TL, - DAG.getNode(ISD::SHL, dl, VT, V, Shift)); + SDValue HiLHS, HiRHS; + if (Signed) { + // The high part is obtained by SRA'ing all but one of the bits of low + // part. + unsigned LoSize = VT.getFixedSizeInBits(); + SDValue Shift = DAG.getShiftAmountConstant(LoSize - 1, VT, dl); + HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, Shift); + HiRHS = DAG.getNode(ISD::SRA, dl, VT, RHS, Shift); + } else { + HiLHS = DAG.getConstant(0, dl, VT); + HiRHS = DAG.getConstant(0, dl, VT); + } - Hi = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RH), - DAG.getNode(ISD::ADD, dl, VT, UH, VH)); + // Attempt a libcall. + SDValue Ret; + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setIsSigned(Signed); + CallOptions.setIsPostTypeLegalization(true); + if (shouldSplitFunctionArgumentsAsLittleEndian(DAG.getDataLayout())) { + // Halves of WideVT are packed into registers in different order + // depending on platform endianness. This is usually handled by + // the C calling convention, but we can't defer to it in + // the legalizer. + SDValue Args[] = {LHS, HiLHS, RHS, HiRHS}; + Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first; + } else { + SDValue Args[] = {HiLHS, LHS, HiRHS, RHS}; + Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first; + } + assert(Ret.getOpcode() == ISD::MERGE_VALUES && + "Ret value is a collection of constituent nodes holding result."); + if (DAG.getDataLayout().isLittleEndian()) { + // Same as above. 
+    Lo = Ret.getOperand(0);
+    Hi = Ret.getOperand(1);
+  } else {
+    Lo = Ret.getOperand(1);
+    Hi = Ret.getOperand(0);
+  }
 }
 
 SDValue
From 5e65f430414dd9df79ca6a1056b4943110ebc14b Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Sat, 25 Jan 2025 11:48:51 -0800
Subject: [PATCH 103/432] [SLP][NFC] Add a test, producing a series of
 extractelements, building a non-extendable tree

---
 .../X86/extracts-non-extendable.ll            | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll b/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
new file mode 100644
index 0000000000000..d87c40511fcf7
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-grtev4-linux-gnu -mattr="+aes,+avx,+cmov,+crc32,+cx16,+cx8,+fxsr,+mmx,+pclmul,+popcnt,+prfchw,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" < %s | FileCheck %s
+
+define void @test(i64 %v) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i64 [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i64> <i64 0, i64 poison>, i64 [[V]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 0, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT:    [[TMP5:%.*]] = and i1 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT:    [[TMP7:%.*]] = and i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT:    [[TMP9:%.*]] = and i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and i1 [[TMP9]], false
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 0, [[TMP2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT:    [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT:    [[TMP18:%.*]] = and i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ult i64 0, 0
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i1 [[TMP18]], i1 false
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[BB_I107_PREHEADER:.*]], label %[[BB_I27_I_PREHEADER:.*]]
+; CHECK:       [[BB_I107_PREHEADER]]:
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    [[DOTSROA_1278_10_EXTRACT_SHIFT83_I1622_1:%.*]] = xor i64 0, [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = xor <2 x i64> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or <2 x i64> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = or <2 x i64> splat (i64 1), [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = and <2 x i64> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq <2 x i64> [[TMP25]], zeroinitializer
+; CHECK-NEXT:    ret void
+; CHECK:       [[BB_I27_I_PREHEADER]]:
+; CHECK-NEXT:    unreachable
+;
+bb:
+  %.sroa.82529.14.insert.insert = or i64 0, 0
+  %.sroa.02528.sroa.0.0.insert.insert = or i64 %v, 0
+  %0 = icmp eq i64 0, %.sroa.02528.sroa.0.0.insert.insert
+  %1 = icmp eq i64 0, 0
+  %2 = and i1 %0, %1
+  %3 = icmp eq i64 0, 0
+  %4 = and i1 %2, %3
+  %5 = icmp eq i64 0, 0
+  %6 = and i1 %4, %5
+  %7 = and
i1 %6, false + %8 = icmp eq i64 0, %.sroa.02528.sroa.0.0.insert.insert + %9 = and i1 %7, %8 + %10 = icmp eq i64 0, 0 + %11 = and i1 %9, %10 + %12 = icmp eq i64 0, 0 + %13 = and i1 %11, %12 + %14 = icmp eq i64 0, 0 + %15 = and i1 %13, %14 + %16 = icmp ult i64 0, 0 + %17 = select i1 %16, i1 %15, i1 false + br i1 %17, label %bb.i107.preheader, label %bb.i27.i.preheader + +bb.i107.preheader: ; preds = %bb + %.sroa.1278.10.extract.shift83.i1622.1 = xor i64 0, %.sroa.82529.14.insert.insert + %.sroa.076.2.extract.shift80.i1619.4 = xor i64 0, %.sroa.02528.sroa.0.0.insert.insert + %.sroa.071.2.extract.shift86.i1625.4 = or i64 %.sroa.076.2.extract.shift80.i1619.4, 0 + %.sroa.1278.10.extract.shift83.i1622.7 = xor i64 0, %.sroa.82529.14.insert.insert + %.sroa.12.10.extract.shift89.i1634.7 = or i64 %.sroa.1278.10.extract.shift83.i1622.7, 0 + %.sroa.02756.2.extract.shift6530 = or i64 %.sroa.071.2.extract.shift86.i1625.4, 1 + %18 = and i64 %.sroa.02756.2.extract.shift6530, 0 + %19 = icmp eq i64 %18, 0 + %20 = or i64 1, %.sroa.12.10.extract.shift89.i1634.7 + %21 = and i64 %20, 0 + %22 = icmp eq i64 %21, 0 + ret void + +bb.i27.i.preheader: ; preds = %bb + unreachable +} + From e5b0132d157ad4c9a502dc8c4a61a3a3c83646c2 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sat, 25 Jan 2025 21:15:10 +0000 Subject: [PATCH 104/432] SCEV: add samesign tests for exit-limit computation (#124304) As the tests demonstrate, computeExitLimitFromICmp needs no additional changes to compute exit limits from an icmp with samesign. --- .../ScalarEvolution/exit-count-non-strict.ll | 235 ++++++++++++++++++ 1 file changed, 235 insertions(+) diff --git a/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll b/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll index f7a18c77a82c8..1e15d2d0d6461 100644 --- a/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll +++ b/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll @@ -30,6 +30,35 @@ exit: ret void } +define void @le_from_zero(i32 %M, i32 %N) { +; CHECK-LABEL: 'le_from_zero' +; CHECK-NEXT: Determining loop execution counts for: @le_from_zero +; CHECK-NEXT: Loop %loop: backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))) +; CHECK-NEXT: exit count for loop: (1 + (zext i32 %M to i64)) +; CHECK-NEXT: exit count for latch: %N +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))) +; CHECK-NEXT: symbolic max exit count for loop: (1 + (zext i32 %M to i64)) +; CHECK-NEXT: symbolic max exit count for latch: %N +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add nuw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @ule_from_one(i32 %M, i32 %N) { ; CHECK-LABEL: 'ule_from_one' ; CHECK-NEXT: Determining loop execution counts for: @ule_from_one @@ -59,6 +88,35 @@ exit: ret void } +define void @le_from_one(i32 %M, i32 %N) { +; CHECK-LABEL: 'le_from_one' +; CHECK-NEXT: Determining loop execution counts for: @le_from_one +; CHECK-NEXT: Loop %loop: backedge-taken count is (%M umin_seq (-1 + %N)) +; CHECK-NEXT: exit count for loop: %M +; CHECK-NEXT: exit count for latch: (-1 + %N) +; CHECK-NEXT: Loop %loop: constant max 
backedge-taken count is i32 -1 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (%M umin_seq (-1 + %N)) +; CHECK-NEXT: symbolic max exit count for loop: %M +; CHECK-NEXT: symbolic max exit count for latch: (-1 + %N) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 1, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add nuw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @ule_from_unknown(i32 %M, i32 %N, i32 %S) { ; CHECK-LABEL: 'ule_from_unknown' ; CHECK-NEXT: Determining loop execution counts for: @ule_from_unknown @@ -133,6 +191,51 @@ exit: ret void } +define void @le_from_zero_no_nuw(i32 %M, i32 %N) { +; CHECK-LABEL: 'le_from_zero_no_nuw' +; CHECK-NEXT: Determining loop execution counts for: @le_from_zero_no_nuw +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: predicated exit count for loop: (1 + (zext i32 %M to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: exit count for latch: %N +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -1 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is %N +; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: predicated symbolic max exit count for loop: (1 + (zext i32 %M to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: symbolic max exit count for latch: %N +; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4294967295 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @sle_from_int_min(i32 %M, i32 %N) { ; CHECK-LABEL: 'sle_from_int_min' ; CHECK-NEXT: Determining loop execution counts for: @sle_from_int_min @@ -162,6 +265,35 @@ exit: ret void } +define void @le_from_int_min(i32 %M, i32 %N) { +; CHECK-LABEL: 'le_from_int_min' +; CHECK-NEXT: Determining loop execution counts for: @le_from_int_min +; CHECK-NEXT: Loop %loop: backedge-taken count is ((-2147483647 + (2147483647 umax %M)) umin_seq (-2147483648 + %N)) +; CHECK-NEXT: exit count for loop: (-2147483647 + (2147483647 umax %M)) +; CHECK-NEXT: exit count for latch: (-2147483648 + %N) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -2147483648 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-2147483647 + (2147483647 umax %M)) umin_seq (-2147483648 + %N)) +; CHECK-NEXT: symbolic max exit count for loop: (-2147483647 + (2147483647 umax %M)) +; CHECK-NEXT: symbolic max exit count for latch: (-2147483648 + %N) +; 
CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + br label %loop + +loop: + %iv = phi i32 [ u0x80000000, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @sle_from_int_min_plus_one(i32 %M, i32 %N) { ; CHECK-LABEL: 'sle_from_int_min_plus_one' ; CHECK-NEXT: Determining loop execution counts for: @sle_from_int_min_plus_one @@ -191,6 +323,35 @@ exit: ret void } +define void @le_from_int_min_plus_one(i32 %M, i32 %N) { +; CHECK-LABEL: 'le_from_int_min_plus_one' +; CHECK-NEXT: Determining loop execution counts for: @le_from_int_min_plus_one +; CHECK-NEXT: Loop %loop: backedge-taken count is ((-2147483648 + (-2147483648 umax %M)) umin_seq (2147483647 + %N)) +; CHECK-NEXT: exit count for loop: (-2147483648 + (-2147483648 umax %M)) +; CHECK-NEXT: exit count for latch: (2147483647 + %N) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 2147483647 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-2147483648 + (-2147483648 umax %M)) umin_seq (2147483647 + %N)) +; CHECK-NEXT: symbolic max exit count for loop: (-2147483648 + (-2147483648 umax %M)) +; CHECK-NEXT: symbolic max exit count for latch: (2147483647 + %N) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + br label %loop + +loop: + %iv = phi i32 [ u0x80000001, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @sle_from_unknown(i32 %M, i32 %N, i32 %S) { ; CHECK-LABEL: 'sle_from_unknown' ; CHECK-NEXT: Determining loop execution counts for: @sle_from_unknown @@ -220,6 +381,35 @@ exit: ret void } +define void @le_from_unknown(i32 %M, i32 %N, i32 %S) { +; CHECK-LABEL: 'le_from_unknown' +; CHECK-NEXT: Determining loop execution counts for: @le_from_unknown +; CHECK-NEXT: Loop %loop: backedge-taken count is (((-1 * (zext i32 %S to i64)) + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64)))) umin_seq (zext i32 ((-1 * %S) + %N) to i64)) +; CHECK-NEXT: exit count for loop: ((-1 * (zext i32 %S to i64)) + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64)))) +; CHECK-NEXT: exit count for latch: ((-1 * %S) + %N) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (((-1 * (zext i32 %S to i64)) + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64)))) umin_seq (zext i32 ((-1 * %S) + %N) to i64)) +; CHECK-NEXT: symbolic max exit count for loop: ((-1 * (zext i32 %S to i64)) + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64)))) +; CHECK-NEXT: symbolic max exit count for latch: ((-1 * %S) + %N) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %S, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @sle_from_int_min_no_nsw(i32 %M, i32 %N) { ; CHECK-LABEL: 'sle_from_int_min_no_nsw' ; CHECK-NEXT: Determining loop execution counts for: @sle_from_int_min_no_nsw @@ -264,3 +454,48 @@ latch: exit: 
ret void } + +define void @le_from_int_min_no_nuw_nsw(i32 %M, i32 %N) { +; CHECK-LABEL: 'le_from_int_min_no_nuw_nsw' +; CHECK-NEXT: Determining loop execution counts for: @le_from_int_min_no_nuw_nsw +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: predicated exit count for loop: (-2147483648 + (2147483648 umax (1 + (zext i32 %M to i64)))) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: exit count for latch: (-2147483648 + %N) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -1 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-2147483648 + %N) +; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: predicated symbolic max exit count for loop: (-2147483648 + (2147483648 umax (1 + (zext i32 %M to i64)))) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: symbolic max exit count for latch: (-2147483648 + %N) +; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-2147483648 + (2147483648 umax (1 + (zext i32 %M to i64)))) umin_seq (zext i32 (-2147483648 + %N) to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 2147483648 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-2147483648 + (2147483648 umax (1 + (zext i32 %M to i64)))) umin_seq (zext i32 (-2147483648 + %N) to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ u0x80000000, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} From 89f2fee9f80658650524ba4fc12f01409e45000e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 13:17:15 -0800 Subject: [PATCH 105/432] [InstCombine] Add test for incorrect retention of Range attribute in fshl --- llvm/test/Transforms/InstCombine/fsh.ll | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/fsh.ll b/llvm/test/Transforms/InstCombine/fsh.ll index 236c69e7a5bcb..34648d586300f 100644 --- a/llvm/test/Transforms/InstCombine/fsh.ll +++ b/llvm/test/Transforms/InstCombine/fsh.ll @@ -1068,3 +1068,18 @@ entry: %res = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> %x, <2 x i31> zeroinitializer, <2 x i31> %y) ret <2 x i31> %res } + +define i8 @fshl_range_trunc(i1 %x) { +; CHECK-LABEL: @fshl_range_trunc( +; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[X:%.*]] to i32 +; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[ZEXT]], 126 +; CHECK-NEXT: [[FSHL:%.*]] = call range(i32 -4, 2) i32 @llvm.fshl.i32(i32 [[OR]], i32 -2, i32 1) +; CHECK-NEXT: [[TR:%.*]] = trunc nuw i32 [[FSHL]] to i8 +; CHECK-NEXT: ret i8 [[TR]] +; + %zext = zext i1 %x to i32 + %or = or disjoint i32 %zext, -2 + %fshl = call range(i32 -4, 2) i32 @llvm.fshl.i32(i32 %or, i32 %or, i32 1) + %tr = trunc nsw i32 %fshl to i8 + ret i8 %tr +} From 77c325b646301e394bcd89c2980b4c2da8af49cd Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Sat, 25 Jan 2025 22:20:34 +0100 Subject: [PATCH 106/432] [LLD][COFF] Keep hasData true in 
NullChunk constructor (#124368)

`NullChunk` instances do write data, even if it's always zero. Setting
`hasData` to false causes `Writer::assignAddresses` to ignore them when
calculating `rawSize`. This typically isn't an issue, as null chunks are
usually positioned within a section, and later chunks adjust the size
accordingly. However, on ARM64EC, the auxiliary IAT is placed at the end
of the `.rdata` section and terminates with a null chunk. As a result,
`rawSize` is never updated to account for it, and space for the null
chunk is not allocated. Consequently, when `NullChunk::writeTo` is
called, it receives an invalid pointer - either pointing to the next
section or beyond the allocated buffer.
---
 lld/COFF/DLL.cpp                  |  1 -
 lld/test/COFF/arm64ec-import.test | 13 +++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp
index 6a3f8eb21e847..ae3a8047b7008 100644
--- a/lld/COFF/DLL.cpp
+++ b/lld/COFF/DLL.cpp
@@ -132,7 +132,6 @@ class ImportDirectoryChunk : public NonSectionChunk {
 class NullChunk : public NonSectionChunk {
 public:
   explicit NullChunk(size_t n, uint32_t align) : size(n) {
-    hasData = false;
     setAlignment(align);
   }
   explicit NullChunk(COFFLinkerContext &ctx)
diff --git a/lld/test/COFF/arm64ec-import.test b/lld/test/COFF/arm64ec-import.test
index 033c27884be02..bb2b772081d59 100644
--- a/lld/test/COFF/arm64ec-import.test
+++ b/lld/test/COFF/arm64ec-import.test
@@ -160,6 +160,19 @@ BASERELOC-NEXT: Type: DIR64
 BASERELOC-NEXT: Address: 0x5020
 BASERELOC-NEXT: }
+
+Build with -filealign:8 to enable precise size checking.
+
+RUN: lld-link -machine:arm64ec -dll -noentry -out:out-size.dll loadconfig-arm64ec.obj icall.obj hybmp.obj \
+RUN:     test.obj test-arm64ec.lib test2-arm64ec.lib -filealign:8
+
+RUN: llvm-readobj --headers out-size.dll | FileCheck --check-prefix=RDATA-HEADER %s
+
+RDATA-HEADER:      Name: .rdata (2E 72 64 61 74 61 00 00)
+RDATA-HEADER-NEXT: VirtualSize: 0x2030
+RDATA-HEADER-NEXT: VirtualAddress: 0x3000
+RDATA-HEADER-NEXT: RawDataSize: 8240
+
 #--- test.s
 .section .test, "r"
 .globl arm64ec_data_sym

From 2131115be5b9d8b39af80973d9b64c0adc41d38d Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 25 Jan 2025 13:35:11 -0800
Subject: [PATCH 107/432] [InstCombine] Drop Range attribute when simplifying
 'fshl' based on demanded bits (#124429)

When simplifying operands based on demanded bits, the return value
range of llvm.fshl might change. Keeping the Range attribute might
cause llvm.fshl to generate a poison value and lead to a miscompile.
Drop the Range attribute, similar to `dropPoisonGeneratingFlags`
elsewhere.
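As a worked illustration (a hypothetical, self-contained C++ sketch; the
fshl semantics are modeled by hand and the constants come from the
fshl_range_trunc test updated below):

  #include <cstdint>
  #include <cstdio>

  // Models i32 @llvm.fshl.i32(a, b, 1): concat(a, b) shifted left by
  // one bit, keeping the top 32 bits.
  static uint32_t fshl32_by1(uint32_t a, uint32_t b) {
    return (a << 1) | (b >> 31);
  }

  int main() {
    // Before simplification %or = zext(i1 %x) | -2 is -2 or -1, so
    // fshl(%or, %or, 1) is -3 or -1: inside the annotated range [-4, 2).
    printf("%d\n", (int32_t)fshl32_by1(0xFFFFFFFEu, 0xFFFFFFFEu)); // -3
    // After the demanded-bits rewrite the operands become (zext | 126)
    // and the constant -2. The demanded low 8 bits are unchanged (0xFD),
    // but the full i32 result is now 253, outside [-4, 2); a retained
    // range(i32 -4, 2) would therefore make the call return poison.
    printf("%d\n", (int32_t)fshl32_by1(126u, 0xFFFFFFFEu)); // 253
  }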
Fix #124387

---
 .../InstCombine/InstCombineSimplifyDemanded.cpp | 9 ++++++---
 llvm/test/Transforms/InstCombine/fsh.ll         | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 943598a30f040..2c8939b5a0514 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1039,11 +1039,14 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
       APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt));
       APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt));
       if (I->getOperand(0) != I->getOperand(1)) {
-        if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown,
-                                 Depth + 1, Q) ||
+        if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1,
+                                 Q) ||
             SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1,
-                                 Q))
+                                 Q)) {
+          // Range attribute may no longer hold.
+          I->dropPoisonGeneratingReturnAttributes();
           return I;
+        }
       } else { // fshl is a rotate
         // Avoid converting rotate into funnel shift.
         // Only simplify if one operand is constant.
diff --git a/llvm/test/Transforms/InstCombine/fsh.ll b/llvm/test/Transforms/InstCombine/fsh.ll
index 34648d586300f..3ff4f9a2abf33 100644
--- a/llvm/test/Transforms/InstCombine/fsh.ll
+++ b/llvm/test/Transforms/InstCombine/fsh.ll
@@ -1069,11 +1069,12 @@ entry:
   ret <2 x i31> %res
 }

+;; Issue #124387: Range attribute no longer holds after operands changed.
 define i8 @fshl_range_trunc(i1 %x) {
 ; CHECK-LABEL: @fshl_range_trunc(
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[X:%.*]] to i32
 ; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[ZEXT]], 126
-; CHECK-NEXT: [[FSHL:%.*]] = call range(i32 -4, 2) i32 @llvm.fshl.i32(i32 [[OR]], i32 -2, i32 1)
+; CHECK-NEXT: [[FSHL:%.*]] = call i32 @llvm.fshl.i32(i32 [[OR]], i32 -2, i32 1)
 ; CHECK-NEXT: [[TR:%.*]] = trunc nuw i32 [[FSHL]] to i8
 ; CHECK-NEXT: ret i8 [[TR]]
 ;

From 1395cd015f2edf26f8c2567870183d63f4fdd753 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sat, 25 Jan 2025 21:55:15 +0000
Subject: [PATCH 108/432] [VPlan] Support multi-exit loops in HCFG builder.

Update HCFG construction to support multi-exit loops. If there is no
unique exit block, map the middle block of the initial plan to the exit
block from the latch.

This further unifies HCFG construction and prepares for using it to
also build an initial VPlan (VPlan0) for inner loops.

Effectively NFC as this isn't used on the default code path yet.
---
 .../Transforms/Vectorize/VPlanHCFGBuilder.cpp |  60 +++++++---
 .../Transforms/Vectorize/VPlanHCFGTest.cpp    | 109 ++++++++++++++++++
 2 files changed, 155 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 32723e5db9c45..5a2e5d7cfee48 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -350,10 +350,25 @@ void PlainCFGBuilder::buildPlainCFG() {
   // new vector preheader); here we're interested in setting BB2VPBB to the
   // latter.
BB2VPBB[ThePreheaderBB] = VectorPreheaderVPBB; - BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock(); Loop2Region[LI->getLoopFor(TheLoop->getHeader())] = TheRegion; - assert(LoopExitBB && "Loops with multiple exits are not supported."); - BB2VPBB[LoopExitBB] = cast(TheRegion->getSingleSuccessor()); + BasicBlock *ExitBB = TheLoop->getUniqueExitBlock(); + if (!ExitBB) { + // If there is no unique exit block, we must exit via the latch. This exit + // is mapped to the middle block in the input plan. + BasicBlock *Latch = TheLoop->getLoopLatch(); + auto *Br = cast(Latch->getTerminator()); + if (TheLoop->contains(Br->getSuccessor(0))) { + assert(!TheLoop->contains(Br->getSuccessor(1)) && + "latch must exit the loop"); + ExitBB = Br->getSuccessor(1); + } else { + assert(!TheLoop->contains(Br->getSuccessor(0)) && + "latch must exit the loop"); + ExitBB = Br->getSuccessor(0); + } + } + assert(ExitBB && "Must have a unique exit block or also exit via the latch."); + BB2VPBB[ExitBB] = cast(TheRegion->getSingleSuccessor()); // The existing vector region's entry and exiting VPBBs correspond to the loop // header and latch. @@ -423,21 +438,38 @@ void PlainCFGBuilder::buildPlainCFG() { // representing the condition bit in VPlan (which may be in another VPBB). assert(IRDef2VPValue.contains(BI->getCondition()) && "Missing condition bit in IRDef2VPValue!"); - VPBasicBlock *Successor0 = getOrCreateVPBB(BI->getSuccessor(0)); - VPBasicBlock *Successor1 = getOrCreateVPBB(BI->getSuccessor(1)); - if (!LoopForBB || BB != LoopForBB->getLoopLatch()) { - VPBB->setTwoSuccessors(Successor0, Successor1); - continue; - } - // For a latch we need to set the successor of the region rather than that - // of VPBB and it should be set to the exit, i.e., non-header successor, - // except for the top region, whose successor was set when creating VPlan's - // skeleton. - if (TheRegion != Region) { + + BasicBlock *IRSucc0 = BI->getSuccessor(0); + BasicBlock *IRSucc1 = BI->getSuccessor(1); + VPBasicBlock *Successor0 = getOrCreateVPBB(IRSucc0); + VPBasicBlock *Successor1 = getOrCreateVPBB(IRSucc1); + if (BB == LoopForBB->getLoopLatch()) { + // For a latch we need to set the successor of the region rather than that + // of VPBB and it should be set to the exit, i.e., non-header successor, + // except for the top region, whose successor was set when creating + // VPlan's skeleton. + assert(TheRegion != Region && + "Latch of the top region should have been handled earlier"); Region->setOneSuccessor(isHeaderVPBB(Successor0) ? Successor1 : Successor0); Region->setExiting(VPBB); + continue; } + + // Don't connect any blocks outside the current loop except the latch for + // now. The latch is handled above. + if (LoopForBB) { + if (!LoopForBB->contains(IRSucc0)) { + VPBB->setOneSuccessor(Successor1); + continue; + } + if (!LoopForBB->contains(IRSucc1)) { + VPBB->setOneSuccessor(Successor0); + continue; + } + } + + VPBB->setTwoSuccessors(Successor0, Successor1); } // 2. 
The whole CFG has been built at this point so all the input Values must diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index dcdaf008e10fe..d787a6c977194 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -234,5 +234,114 @@ TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) { EXPECT_EQ(VecBB->end(), Iter); } +TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoopMultiExit) { + const char *ModuleString = + "define void @f(ptr %A, i64 %N) {\n" + "entry:\n" + " br label %loop.header\n" + "loop.header:\n" + " %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]\n" + " %arr.idx = getelementptr inbounds i32, ptr %A, i64 %iv\n" + " %l1 = load i32, ptr %arr.idx, align 4\n" + " %c = icmp eq i32 %l1, 0\n" + " br i1 %c, label %exit.1, label %loop.latch\n" + "loop.latch:\n" + " %res = add i32 %l1, 10\n" + " store i32 %res, ptr %arr.idx, align 4\n" + " %iv.next = add i64 %iv, 1\n" + " %exitcond = icmp ne i64 %iv.next, %N\n" + " br i1 %exitcond, label %loop.header, label %exit.2\n" + "exit.1:\n" + " ret void\n" + "exit.2:\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("f"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + // Add an external value to check we do not print the list of external values, + // as this is not required with the new printing. + Plan->getOrAddLiveIn(&*F->arg_begin()); + std::string FullDump; + raw_string_ostream OS(FullDump); + Plan->printDOT(OS); + const char *ExpectedStr = R"(digraph VPlan { +graph [labelloc=t, fontsize=30; label="Vectorization Plan\n for UF\>=1\nLive-in vp\<%0\> = vector-trip-count\nLive-in ir\<%N\> = original trip-count\n"] +node [shape=rect, fontname=Courier, fontsize=30] +edge [fontname=Courier, fontsize=30] +compound=true + N0 [label = + "ir-bb\:\l" + + "Successor(s): vector.ph\l" + ] + N0 -> N1 [ label=""] + N1 [label = + "vector.ph:\l" + + "Successor(s): vector loop\l" + ] + N1 -> N2 [ label="" lhead=cluster_N3] + subgraph cluster_N3 { + fontname=Courier + label="\ vector loop" + N2 [label = + "vector.body:\l" + + " WIDEN-PHI ir\<%iv\> = phi ir\<0\>, ir\<%iv.next\>\l" + + " EMIT ir\<%arr.idx\> = getelementptr ir\<%A\>, ir\<%iv\>\l" + + " EMIT ir\<%l1\> = load ir\<%arr.idx\>\l" + + " EMIT ir\<%c\> = icmp ir\<%l1\>, ir\<0\>\l" + + "Successor(s): loop.latch\l" + ] + N2 -> N4 [ label=""] + N4 [label = + "loop.latch:\l" + + " EMIT ir\<%res\> = add ir\<%l1\>, ir\<10\>\l" + + " EMIT store ir\<%res\>, ir\<%arr.idx\>\l" + + " EMIT ir\<%iv.next\> = add ir\<%iv\>, ir\<1\>\l" + + " EMIT ir\<%exitcond\> = icmp ir\<%iv.next\>, ir\<%N\>\l" + + "Successor(s): vector.latch\l" + ] + N4 -> N5 [ label=""] + N5 [label = + "vector.latch:\l" + + "No successors\l" + ] + } + N5 -> N6 [ label="" ltail=cluster_N3] + N6 [label = + "middle.block:\l" + + " EMIT vp\<%cmp.n\> = icmp eq ir\<%N\>, vp\<%0\>\l" + + " EMIT branch-on-cond vp\<%cmp.n\>\l" + + "Successor(s): ir-bb\, scalar.ph\l" + ] + N6 -> N7 [ label="T"] + N6 -> N8 [ label="F"] + N7 [label = + "ir-bb\:\l" + + "No successors\l" + ] + N8 [label = + "scalar.ph:\l" + + "Successor(s): ir-bb\\l" + ] + N8 -> N9 [ label=""] + N9 [label = + "ir-bb\:\l" + + " IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]\l" + + " IR %arr.idx = getelementptr inbounds i32, ptr %A, i64 %iv\l" + + " IR %l1 = load i32, ptr 
%arr.idx, align 4\l" + + " IR %c = icmp eq i32 %l1, 0\l" + + "No successors\l" + ] +} +)"; + EXPECT_EQ(ExpectedStr, FullDump); +#endif +} + } // namespace } // namespace llvm From 563c7c5539f05e7f8cbb42565c1f24466019f38b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 14:05:01 -0800 Subject: [PATCH 109/432] [clang] Migrate away from PointerUnion::dyn_cast (NFC) (#124425) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast This patch migrates uses of PointerUnion::dyn_cast to dyn_cast_if_present (see the definition of PointerUnion::dyn_cast). Note that we cannot use dyn_cast in any of the migrations in this patch; placing assert(!X.isNull()); just before any of dyn_cast_if_present in this patch triggers some failure in check-clang. --- clang/include/clang/AST/APValue.h | 5 +- clang/include/clang/AST/ASTContext.h | 2 +- clang/include/clang/AST/Decl.h | 2 +- clang/include/clang/AST/DeclBase.h | 2 +- clang/include/clang/AST/DeclTemplate.h | 9 ++- clang/include/clang/AST/Expr.h | 4 +- clang/include/clang/AST/ExprCXX.h | 6 +- clang/include/clang/Basic/IdentifierTable.h | 2 +- clang/include/clang/Lex/Preprocessor.h | 10 +-- clang/lib/APINotes/APINotesManager.cpp | 4 +- clang/lib/AST/Decl.cpp | 71 ++++++++++--------- clang/lib/AST/DeclCXX.cpp | 10 +-- clang/lib/AST/DeclTemplate.cpp | 4 +- clang/lib/AST/TemplateName.cpp | 11 +-- .../Frontend/SerializedDiagnosticPrinter.cpp | 2 +- clang/lib/Sema/SemaDecl.cpp | 6 +- clang/tools/libclang/CIndexDiagnostic.cpp | 3 +- 17 files changed, 83 insertions(+), 70 deletions(-) diff --git a/clang/include/clang/AST/APValue.h b/clang/include/clang/AST/APValue.h index 833a78c77871d..9999a30c51ade 100644 --- a/clang/include/clang/AST/APValue.h +++ b/clang/include/clang/AST/APValue.h @@ -161,8 +161,9 @@ class APValue { template T get() const { return cast(Ptr); } - template - T dyn_cast() const { return Ptr.dyn_cast(); } + template T dyn_cast() const { + return dyn_cast_if_present(Ptr); + } void *getOpaqueValue() const; diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 4e9b961688d55..65be782c1ba43 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -769,7 +769,7 @@ class ASTContext : public RefCountedBase { /// pool. DeclListNode *AllocateDeclListNode(clang::NamedDecl *ND) { if (DeclListNode *Alloc = ListNodeFreeList) { - ListNodeFreeList = Alloc->Rest.dyn_cast(); + ListNodeFreeList = dyn_cast_if_present(Alloc->Rest); Alloc->D = ND; Alloc->Rest = nullptr; return Alloc; diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index d01681483a918..16403774e72b3 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -4035,7 +4035,7 @@ class EnumDecl : public TagDecl { /// Return the type source info for the underlying integer type, /// if no type source info exists, return 0. 
TypeSourceInfo *getIntegerTypeSourceInfo() const { - return IntegerType.dyn_cast(); + return dyn_cast_if_present(IntegerType); } /// Retrieve the source range that covers the underlying type if diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 573b46a2321c5..2c0c3a8dc2f9d 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -1391,7 +1391,7 @@ class DeclContextLookupResult { const_iterator end() const { return iterator(); } bool empty() const { return Result.isNull(); } - bool isSingleResult() const { return Result.dyn_cast(); } + bool isSingleResult() const { return isa_and_present(Result); } reference front() const { return *begin(); } // Find the first declaration of the given type in the list. Note that this diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index 8c2da97c07a3b..caaa47d0a297c 100644 --- a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -2009,7 +2009,8 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl, /// Retrieve the template argument list as written in the sources, /// if any. const ASTTemplateArgumentListInfo *getTemplateArgsAsWritten() const { - if (auto *Info = ExplicitInfo.dyn_cast()) + if (auto *Info = + dyn_cast_if_present(ExplicitInfo)) return Info->TemplateArgsAsWritten; return cast(ExplicitInfo); } @@ -2041,7 +2042,8 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl, /// Gets the location of the template keyword, if present. SourceLocation getTemplateKeywordLoc() const { - if (auto *Info = ExplicitInfo.dyn_cast()) + if (auto *Info = + dyn_cast_if_present(ExplicitInfo)) return Info->TemplateKeywordLoc; return SourceLocation(); } @@ -2786,7 +2788,8 @@ class VarTemplateSpecializationDecl : public VarDecl, /// Set the template argument list as written in the sources. void setTemplateArgsAsWritten(const ASTTemplateArgumentListInfo *ArgsWritten) { - if (auto *Info = ExplicitInfo.dyn_cast()) + if (auto *Info = + dyn_cast_if_present(ExplicitInfo)) Info->TemplateArgsAsWritten = ArgsWritten; else ExplicitInfo = ArgsWritten; diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 708c8656decbe..7be4022649329 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -5180,7 +5180,7 @@ class InitListExpr : public Expr { /// than there are initializers in the list, specifies an expression to be /// used for value initialization of the rest of the elements. Expr *getArrayFiller() { - return ArrayFillerOrUnionFieldInit.dyn_cast(); + return dyn_cast_if_present(ArrayFillerOrUnionFieldInit); } const Expr *getArrayFiller() const { return const_cast(this)->getArrayFiller(); @@ -5205,7 +5205,7 @@ class InitListExpr : public Expr { /// union. However, a designated initializer can specify the /// initialization of a different field within the union. 
FieldDecl *getInitializedFieldInUnion() { - return ArrayFillerOrUnionFieldInit.dyn_cast(); + return dyn_cast_if_present(ArrayFillerOrUnionFieldInit); } const FieldDecl *getInitializedFieldInUnion() const { return const_cast(this)->getInitializedFieldInUnion(); diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 4cec89c979f77..aa10945addf78 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -5026,11 +5026,11 @@ class CXXParenListInitExpr final void setArrayFiller(Expr *E) { ArrayFillerOrUnionFieldInit = E; } Expr *getArrayFiller() { - return ArrayFillerOrUnionFieldInit.dyn_cast(); + return dyn_cast_if_present(ArrayFillerOrUnionFieldInit); } const Expr *getArrayFiller() const { - return ArrayFillerOrUnionFieldInit.dyn_cast(); + return dyn_cast_if_present(ArrayFillerOrUnionFieldInit); } void setInitializedFieldInUnion(FieldDecl *FD) { @@ -5038,7 +5038,7 @@ class CXXParenListInitExpr final } FieldDecl *getInitializedFieldInUnion() { - return ArrayFillerOrUnionFieldInit.dyn_cast(); + return dyn_cast_if_present(ArrayFillerOrUnionFieldInit); } const FieldDecl *getInitializedFieldInUnion() const { diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h index 33d1cdb46f108..e5e6be3c96600 100644 --- a/clang/include/clang/Basic/IdentifierTable.h +++ b/clang/include/clang/Basic/IdentifierTable.h @@ -1008,7 +1008,7 @@ class Selector { } const IdentifierInfo *getAsIdentifierInfo() const { - return InfoPtr.getPointer().dyn_cast(); + return dyn_cast_if_present(InfoPtr.getPointer()); } MultiKeywordSelector *getMultiKeywordSelector() const { diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 8ddc5b56eedbd..416f403c29841 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -856,7 +856,7 @@ class Preprocessor { !PP.CurSubmoduleState->VisibleModules.getGeneration()) return nullptr; - auto *Info = State.dyn_cast(); + auto *Info = dyn_cast_if_present(State); if (!Info) { Info = new (PP.getPreprocessorAllocator()) ModuleMacroInfo(cast(State)); @@ -885,18 +885,18 @@ class Preprocessor { } ~MacroState() { - if (auto *Info = State.dyn_cast()) + if (auto *Info = dyn_cast_if_present(State)) Info->~ModuleMacroInfo(); } MacroDirective *getLatest() const { - if (auto *Info = State.dyn_cast()) + if (auto *Info = dyn_cast_if_present(State)) return Info->MD; return cast(State); } void setLatest(MacroDirective *MD) { - if (auto *Info = State.dyn_cast()) + if (auto *Info = dyn_cast_if_present(State)) Info->MD = MD; else State = MD; @@ -940,7 +940,7 @@ class Preprocessor { void setOverriddenMacros(Preprocessor &PP, ArrayRef Overrides) { - auto *Info = State.dyn_cast(); + auto *Info = dyn_cast_if_present(State); if (!Info) { if (Overrides.empty()) return; diff --git a/clang/lib/APINotes/APINotesManager.cpp b/clang/lib/APINotes/APINotesManager.cpp index 70d96c735503f..7f8a126ffaa03 100644 --- a/clang/lib/APINotes/APINotesManager.cpp +++ b/clang/lib/APINotes/APINotesManager.cpp @@ -56,7 +56,7 @@ APINotesManager::APINotesManager(SourceManager &SM, const LangOptions &LangOpts) APINotesManager::~APINotesManager() { // Free the API notes readers. for (const auto &Entry : Readers) { - if (auto Reader = Entry.second.dyn_cast()) + if (auto Reader = dyn_cast_if_present(Entry.second)) delete Reader; } @@ -381,7 +381,7 @@ APINotesManager::findAPINotes(SourceLocation Loc) { } // We have the answer. 
- if (auto Reader = Known->second.dyn_cast()) + if (auto Reader = dyn_cast_if_present(Known->second)) Results.push_back(Reader); break; } diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 5ce03ce20d284..74bcb618f2950 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -2447,7 +2447,7 @@ bool VarDecl::isOutOfLine() const { } void VarDecl::setInit(Expr *I) { - if (auto *Eval = Init.dyn_cast()) { + if (auto *Eval = dyn_cast_if_present(Init)) { Eval->~EvaluatedStmt(); getASTContext().Deallocate(Eval); } @@ -2527,7 +2527,7 @@ bool VarDecl::isUsableInConstantExpressions(const ASTContext &Context) const { /// form, which contains extra information on the evaluated value of the /// initializer. EvaluatedStmt *VarDecl::ensureEvaluatedStmt() const { - auto *Eval = Init.dyn_cast(); + auto *Eval = dyn_cast_if_present(Init); if (!Eval) { // Note: EvaluatedStmt contains an APValue, which usually holds // resources not allocated from the ASTContext. We need to do some @@ -2541,7 +2541,7 @@ EvaluatedStmt *VarDecl::ensureEvaluatedStmt() const { } EvaluatedStmt *VarDecl::getEvaluatedStmt() const { - return Init.dyn_cast(); + return dyn_cast_if_present(Init); } APValue *VarDecl::evaluateValue() const { @@ -2784,8 +2784,8 @@ SourceLocation VarDecl::getPointOfInstantiation() const { } VarTemplateDecl *VarDecl::getDescribedVarTemplate() const { - return getASTContext().getTemplateOrSpecializationInfo(this) - .dyn_cast(); + return dyn_cast_if_present( + getASTContext().getTemplateOrSpecializationInfo(this)); } void VarDecl::setDescribedVarTemplate(VarTemplateDecl *Template) { @@ -2875,8 +2875,8 @@ MemberSpecializationInfo *VarDecl::getMemberSpecializationInfo() const { if (isStaticDataMember()) // FIXME: Remove ? // return getASTContext().getInstantiatedFromStaticDataMember(this); - return getASTContext().getTemplateOrSpecializationInfo(this) - .dyn_cast(); + return dyn_cast_if_present( + getASTContext().getTemplateOrSpecializationInfo(this)); return nullptr; } @@ -4040,11 +4040,11 @@ FunctionDecl *FunctionDecl::getInstantiatedFromMemberFunction() const { } MemberSpecializationInfo *FunctionDecl::getMemberSpecializationInfo() const { - if (auto *MSI = - TemplateOrSpecialization.dyn_cast()) + if (auto *MSI = dyn_cast_if_present( + TemplateOrSpecialization)) return MSI; - if (auto *FTSI = TemplateOrSpecialization - .dyn_cast()) + if (auto *FTSI = dyn_cast_if_present( + TemplateOrSpecialization)) return FTSI->getMemberSpecializationInfo(); return nullptr; } @@ -4062,7 +4062,7 @@ FunctionDecl::setInstantiationOfMemberFunction(ASTContext &C, FunctionTemplateDecl *FunctionDecl::getDescribedFunctionTemplate() const { return dyn_cast_if_present( - TemplateOrSpecialization.dyn_cast()); + dyn_cast_if_present(TemplateOrSpecialization)); } void FunctionDecl::setDescribedFunctionTemplate( @@ -4181,9 +4181,9 @@ FunctionDecl::getTemplateInstantiationPattern(bool ForDefinition) const { } FunctionTemplateDecl *FunctionDecl::getPrimaryTemplate() const { - if (FunctionTemplateSpecializationInfo *Info - = TemplateOrSpecialization - .dyn_cast()) { + if (FunctionTemplateSpecializationInfo *Info = + dyn_cast_if_present( + TemplateOrSpecialization)) { return Info->getTemplate(); } return nullptr; @@ -4191,15 +4191,15 @@ FunctionTemplateDecl *FunctionDecl::getPrimaryTemplate() const { FunctionTemplateSpecializationInfo * FunctionDecl::getTemplateSpecializationInfo() const { - return TemplateOrSpecialization - .dyn_cast(); + return dyn_cast_if_present( + TemplateOrSpecialization); } const 
TemplateArgumentList * FunctionDecl::getTemplateSpecializationArgs() const { - if (FunctionTemplateSpecializationInfo *Info - = TemplateOrSpecialization - .dyn_cast()) { + if (FunctionTemplateSpecializationInfo *Info = + dyn_cast_if_present( + TemplateOrSpecialization)) { return Info->TemplateArguments; } return nullptr; @@ -4207,14 +4207,14 @@ FunctionDecl::getTemplateSpecializationArgs() const { const ASTTemplateArgumentListInfo * FunctionDecl::getTemplateSpecializationArgsAsWritten() const { - if (FunctionTemplateSpecializationInfo *Info - = TemplateOrSpecialization - .dyn_cast()) { + if (FunctionTemplateSpecializationInfo *Info = + dyn_cast_if_present( + TemplateOrSpecialization)) { return Info->TemplateArgumentsAsWritten; } if (DependentFunctionTemplateSpecializationInfo *Info = - TemplateOrSpecialization - .dyn_cast()) { + dyn_cast_if_present( + TemplateOrSpecialization)) { return Info->TemplateArgumentsAsWritten; } return nullptr; @@ -4239,7 +4239,8 @@ void FunctionDecl::setFunctionTemplateSpecialization( FunctionTemplateSpecializationInfo::Create( C, this, Template, TSK, TemplateArgs, TemplateArgsAsWritten, PointOfInstantiation, - TemplateOrSpecialization.dyn_cast()); + dyn_cast_if_present( + TemplateOrSpecialization)); TemplateOrSpecialization = Info; Template->addSpecialization(Info, InsertPos); } @@ -4256,8 +4257,8 @@ void FunctionDecl::setDependentTemplateSpecialization( DependentFunctionTemplateSpecializationInfo * FunctionDecl::getDependentSpecializationInfo() const { - return TemplateOrSpecialization - .dyn_cast(); + return dyn_cast_if_present( + TemplateOrSpecialization); } DependentFunctionTemplateSpecializationInfo * @@ -4288,12 +4289,13 @@ TemplateSpecializationKind FunctionDecl::getTemplateSpecializationKind() const { // For a function template specialization, query the specialization // information object. if (FunctionTemplateSpecializationInfo *FTSInfo = - TemplateOrSpecialization - .dyn_cast()) + dyn_cast_if_present( + TemplateOrSpecialization)) return FTSInfo->getTemplateSpecializationKind(); if (MemberSpecializationInfo *MSInfo = - TemplateOrSpecialization.dyn_cast()) + dyn_cast_if_present( + TemplateOrSpecialization)) return MSInfo->getTemplateSpecializationKind(); // A dependent function template specialization is an explicit specialization, @@ -4331,15 +4333,16 @@ FunctionDecl::getTemplateSpecializationKindForInstantiation() const { // of A::f, and that A::f should be implicitly instantiated // from A::f if a definition is needed. 
if (FunctionTemplateSpecializationInfo *FTSInfo = - TemplateOrSpecialization - .dyn_cast()) { + dyn_cast_if_present( + TemplateOrSpecialization)) { if (auto *MSInfo = FTSInfo->getMemberSpecializationInfo()) return MSInfo->getTemplateSpecializationKind(); return FTSInfo->getTemplateSpecializationKind(); } if (MemberSpecializationInfo *MSInfo = - TemplateOrSpecialization.dyn_cast()) + dyn_cast_if_present( + TemplateOrSpecialization)) return MSInfo->getTemplateSpecializationKind(); if (isa( diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 44f45898fb483..c0a4356dcb004 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -1987,7 +1987,8 @@ CXXRecordDecl *CXXRecordDecl::getInstantiatedFromMemberClass() const { } MemberSpecializationInfo *CXXRecordDecl::getMemberSpecializationInfo() const { - return TemplateOrInstantiation.dyn_cast(); + return dyn_cast_if_present( + TemplateOrInstantiation); } void @@ -2001,7 +2002,7 @@ CXXRecordDecl::setInstantiationOfMemberClass(CXXRecordDecl *RD, } ClassTemplateDecl *CXXRecordDecl::getDescribedClassTemplate() const { - return TemplateOrInstantiation.dyn_cast(); + return dyn_cast_if_present(TemplateOrInstantiation); } void CXXRecordDecl::setDescribedClassTemplate(ClassTemplateDecl *Template) { @@ -2045,7 +2046,7 @@ const CXXRecordDecl *CXXRecordDecl::getTemplateInstantiationPattern() const { // specialization from which it was instantiated. if (auto *TD = dyn_cast(this)) { auto From = TD->getInstantiatedFrom(); - if (auto *CTD = From.dyn_cast()) { + if (auto *CTD = dyn_cast_if_present(From)) { while (auto *NewCTD = CTD->getInstantiatedFromMemberTemplate()) { if (NewCTD->isMemberSpecialization()) break; @@ -2054,7 +2055,8 @@ const CXXRecordDecl *CXXRecordDecl::getTemplateInstantiationPattern() const { return GetDefinitionOrSelf(CTD->getTemplatedDecl()); } if (auto *CTPSD = - From.dyn_cast()) { + dyn_cast_if_present( + From)) { while (auto *NewCTPSD = CTPSD->getInstantiatedFromMember()) { if (NewCTPSD->isMemberSpecialization()) break; diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 2933ba7fb8a29..926b2b26dd381 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -1077,7 +1077,7 @@ ClassTemplateSpecializationDecl::getSourceRange() const { } void ClassTemplateSpecializationDecl::setExternKeywordLoc(SourceLocation Loc) { - auto *Info = ExplicitInfo.dyn_cast(); + auto *Info = dyn_cast_if_present(ExplicitInfo); if (!Info) { // Don't allocate if the location is invalid. if (Loc.isInvalid()) @@ -1091,7 +1091,7 @@ void ClassTemplateSpecializationDecl::setExternKeywordLoc(SourceLocation Loc) { void ClassTemplateSpecializationDecl::setTemplateKeywordLoc( SourceLocation Loc) { - auto *Info = ExplicitInfo.dyn_cast(); + auto *Info = dyn_cast_if_present(ExplicitInfo); if (!Info) { // Don't allocate if the location is invalid. 
if (Loc.isInvalid()) diff --git a/clang/lib/AST/TemplateName.cpp b/clang/lib/AST/TemplateName.cpp index 7d6275caedc4f..3a1eb1ca12f45 100644 --- a/clang/lib/AST/TemplateName.cpp +++ b/clang/lib/AST/TemplateName.cpp @@ -182,7 +182,8 @@ TemplateDecl *TemplateName::getAsTemplateDecl(bool IgnoreDeduced) const { "Unexpected canonical DeducedTemplateName; Did you mean to use " "getTemplateDeclAndDefaultArgs instead?"); - return cast_if_present(Name.Storage.dyn_cast()); + return cast_if_present( + dyn_cast_if_present(Name.Storage)); } std::pair @@ -208,7 +209,7 @@ TemplateName::getTemplateDeclAndDefaultArgs() const { } std::optional TemplateName::desugar(bool IgnoreDeduced) const { - if (Decl *D = Storage.dyn_cast()) { + if (Decl *D = dyn_cast_if_present(Storage)) { if (auto *USD = dyn_cast(D)) return TemplateName(USD->getTargetDecl()); return std::nullopt; @@ -242,7 +243,7 @@ AssumedTemplateStorage *TemplateName::getAsAssumedTemplateName() const { SubstTemplateTemplateParmStorage * TemplateName::getAsSubstTemplateTemplateParm() const { if (UncommonTemplateNameStorage *uncommon = - Storage.dyn_cast()) + dyn_cast_if_present(Storage)) return uncommon->getAsSubstTemplateTemplateParm(); return nullptr; @@ -258,7 +259,7 @@ TemplateName::getAsSubstTemplateTemplateParmPack() const { } QualifiedTemplateName *TemplateName::getAsQualifiedTemplateName() const { - return Storage.dyn_cast(); + return dyn_cast_if_present(Storage); } DependentTemplateName *TemplateName::getAsDependentTemplateName() const { @@ -276,7 +277,7 @@ UsingShadowDecl *TemplateName::getAsUsingShadowDecl() const { DeducedTemplateStorage *TemplateName::getAsDeducedTemplateName() const { if (UncommonTemplateNameStorage *Uncommon = - Storage.dyn_cast()) + dyn_cast_if_present(Storage)) return Uncommon->getAsDeducedTemplateName(); return nullptr; diff --git a/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp b/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp index 0887b5a504f05..131334269aa75 100644 --- a/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp +++ b/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp @@ -650,7 +650,7 @@ void SDiagsWriter::EmitDiagnosticMessage(FullSourceLoc Loc, PresumedLoc PLoc, Record.push_back(getStableLevel(Level)); AddLocToRecord(Loc, PLoc, Record); - if (const Diagnostic *Info = D.dyn_cast()) { + if (const Diagnostic *Info = dyn_cast_if_present(D)) { // Emit the category string lazily and get the category ID. 
unsigned DiagID = DiagnosticIDs::getCategoryNumberForDiag(Info->getID()); Record.push_back(getEmitCategory(DiagID)); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index ad49eac66e98e..c3ff247a6316d 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -17700,9 +17700,11 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, return PrevTagDecl; QualType EnumUnderlyingTy; - if (TypeSourceInfo *TI = EnumUnderlying.dyn_cast()) + if (TypeSourceInfo *TI = + dyn_cast_if_present(EnumUnderlying)) EnumUnderlyingTy = TI->getType().getUnqualifiedType(); - else if (const Type *T = EnumUnderlying.dyn_cast()) + else if (const Type *T = + dyn_cast_if_present(EnumUnderlying)) EnumUnderlyingTy = QualType(T, 0); // All conflicts with previous declarations are recovered by diff --git a/clang/tools/libclang/CIndexDiagnostic.cpp b/clang/tools/libclang/CIndexDiagnostic.cpp index 34792d5bdfaaf..92271d9c37f86 100644 --- a/clang/tools/libclang/CIndexDiagnostic.cpp +++ b/clang/tools/libclang/CIndexDiagnostic.cpp @@ -92,7 +92,8 @@ class CXDiagnosticRenderer : public DiagnosticNoteRenderer { void beginDiagnostic(DiagOrStoredDiag D, DiagnosticsEngine::Level Level) override { - const StoredDiagnostic *SD = D.dyn_cast(); + const StoredDiagnostic *SD = + dyn_cast_if_present(D); if (!SD) return; From 04d5608057f73cf8deb66ddaeddf2f9254fd864b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 14:05:11 -0800 Subject: [PATCH 110/432] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#124430) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect P to be nonnull. --- clang/lib/AST/DeclTemplate.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 926b2b26dd381..e4cb7dcb16a45 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -1801,8 +1801,7 @@ TemplateParameterList *clang::getReplacedTemplateParameterList(Decl *D) { case Decl::Kind::VarTemplateSpecialization: { const auto *VTSD = cast(D); auto P = VTSD->getSpecializedTemplateOrPartial(); - if (const auto *VTPSD = - P.dyn_cast()) + if (const auto *VTPSD = dyn_cast(P)) return VTPSD->getTemplateParameters(); return cast(P)->getTemplateParameters(); } From 19a6ac18ef3e92017db49668ee365e694157f317 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 15:18:13 -0800 Subject: [PATCH 111/432] [ELF] EhFrame: replace failOn with errOn These diagnostics are mostly reported by a thread during writeSections. In LLD_IN_TEST=2 mode, when a thread calls Fatal, there will be no output even if the process exits with code 1. 
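A minimal sketch of the recoverable pattern (hypothetical, self-contained
C++; errOn and the error count only mirror the lld helpers by name, the
rest is illustrative): the reader reports and keeps going with benign
values, and the link still fails at the end through the accumulated
error count, as in the errCount check in Writer::run.

  #include <cstdio>

  struct Ctx { int errCount = 0; };

  void errOn(Ctx &ctx, const char *msg) {
    std::fprintf(stderr, "error: %s\n", msg); // report, but do not exit
    ++ctx.errCount;
  }

  void link(Ctx &ctx) {
    errOn(ctx, "corrupted .eh_frame: unexpected end of CIE");
    if (ctx.errCount)
      return; // diagnostics already printed; process exits with code 1
    // writeOutput(ctx); // only reached when parsing was clean
  }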
--- lld/ELF/EhFrame.cpp | 52 +++++++++++++++++++------------- lld/test/ELF/invalid-eh-frame2.s | 1 + lld/test/ELF/invalid-eh-frame4.s | 1 + lld/test/ELF/invalid-eh-frame6.s | 1 + 4 files changed, 34 insertions(+), 21 deletions(-) diff --git a/lld/ELF/EhFrame.cpp b/lld/ELF/EhFrame.cpp index 6e0120e14988b..5d5a7bc0ab966 100644 --- a/lld/ELF/EhFrame.cpp +++ b/lld/ELF/EhFrame.cpp @@ -41,11 +41,10 @@ class EhReader { bool hasLSDA(); private: - template void failOn(const P *loc, const Twine &msg) { + template void errOn(const P *loc, const Twine &msg) { Ctx &ctx = isec->file->ctx; - Fatal(ctx) << "corrupted .eh_frame: " << msg << "\n>>> defined in " - << isec->getObjMsg((const uint8_t *)loc - - isec->content().data()); + Err(ctx) << "corrupted .eh_frame: " << msg << "\n>>> defined in " + << isec->getObjMsg((const uint8_t *)loc - isec->content().data()); } uint8_t readByte(); @@ -62,8 +61,10 @@ class EhReader { // Read a byte and advance D by one byte. uint8_t EhReader::readByte() { - if (d.empty()) - failOn(d.data(), "unexpected end of CIE"); + if (d.empty()) { + errOn(d.data(), "unexpected end of CIE"); + return 0; + } uint8_t b = d.front(); d = d.slice(1); return b; @@ -71,15 +72,18 @@ uint8_t EhReader::readByte() { void EhReader::skipBytes(size_t count) { if (d.size() < count) - failOn(d.data(), "CIE is too small"); - d = d.slice(count); + errOn(d.data(), "CIE is too small"); + else + d = d.slice(count); } // Read a null-terminated string. StringRef EhReader::readString() { const uint8_t *end = llvm::find(d, '\0'); - if (end == d.end()) - failOn(d.data(), "corrupted CIE (failed to read string)"); + if (end == d.end()) { + errOn(d.data(), "corrupted CIE (failed to read string)"); + return {}; + } StringRef s = toStringRef(d.slice(0, end - d.begin())); d = d.slice(s.size() + 1); return s; @@ -97,7 +101,7 @@ void EhReader::skipLeb128() { if ((val & 0x80) == 0) return; } - failOn(errPos, "corrupted CIE (failed to read LEB128)"); + errOn(errPos, "corrupted CIE (failed to read LEB128)"); } static size_t getAugPSize(Ctx &ctx, unsigned enc) { @@ -121,12 +125,12 @@ static size_t getAugPSize(Ctx &ctx, unsigned enc) { void EhReader::skipAugP() { uint8_t enc = readByte(); if ((enc & 0xf0) == DW_EH_PE_aligned) - failOn(d.data() - 1, "DW_EH_PE_aligned encoding is not supported"); + return errOn(d.data() - 1, "DW_EH_PE_aligned encoding is not supported"); size_t size = getAugPSize(isec->getCtx(), enc); if (size == 0) - failOn(d.data() - 1, "unknown FDE encoding"); + return errOn(d.data() - 1, "unknown FDE encoding"); if (size >= d.size()) - failOn(d.data() - 1, "corrupted CIE"); + return errOn(d.data() - 1, "corrupted CIE"); d = d.slice(size); } @@ -141,9 +145,11 @@ bool elf::hasLSDA(const EhSectionPiece &p) { StringRef EhReader::getAugmentation() { skipBytes(8); int version = readByte(); - if (version != 1 && version != 3) - failOn(d.data() - 1, - "FDE version 1 or 3 expected, but got " + Twine(version)); + if (version != 1 && version != 3) { + errOn(d.data() - 1, + "FDE version 1 or 3 expected, but got " + Twine(version)); + return {}; + } StringRef aug = readString(); @@ -174,8 +180,10 @@ uint8_t EhReader::getFdeEncoding() { readByte(); else if (c == 'P') skipAugP(); - else if (c != 'B' && c != 'S' && c != 'G') - failOn(aug.data(), "unknown .eh_frame augmentation string: " + aug); + else if (c != 'B' && c != 'S' && c != 'G') { + errOn(aug.data(), "unknown .eh_frame augmentation string: " + aug); + break; + } } return DW_EH_PE_absptr; } @@ -191,8 +199,10 @@ bool EhReader::hasLSDA() { skipAugP(); else if 
(c == 'R') readByte(); - else if (c != 'B' && c != 'S' && c != 'G') - failOn(aug.data(), "unknown .eh_frame augmentation string: " + aug); + else if (c != 'B' && c != 'S' && c != 'G') { + errOn(aug.data(), "unknown .eh_frame augmentation string: " + aug); + break; + } } return false; } diff --git a/lld/test/ELF/invalid-eh-frame2.s b/lld/test/ELF/invalid-eh-frame2.s index 87ce8ede72503..01f38738519b6 100644 --- a/lld/test/ELF/invalid-eh-frame2.s +++ b/lld/test/ELF/invalid-eh-frame2.s @@ -2,6 +2,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t # RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld --eh-frame-hdr %t -o /dev/null --noinhibit-exec # CHECK: error: corrupted .eh_frame: corrupted CIE (failed to read string) # CHECK-NEXT: >>> defined in {{.*}}:(.eh_frame+0x9) diff --git a/lld/test/ELF/invalid-eh-frame4.s b/lld/test/ELF/invalid-eh-frame4.s index a567bd40d73ef..60bbc7f22717c 100644 --- a/lld/test/ELF/invalid-eh-frame4.s +++ b/lld/test/ELF/invalid-eh-frame4.s @@ -2,6 +2,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t # RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld --eh-frame-hdr %t -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s # CHECK: corrupted .eh_frame: unknown .eh_frame augmentation string: diff --git a/lld/test/ELF/invalid-eh-frame6.s b/lld/test/ELF/invalid-eh-frame6.s index 77be15f54e6b1..6888419da3e3d 100644 --- a/lld/test/ELF/invalid-eh-frame6.s +++ b/lld/test/ELF/invalid-eh-frame6.s @@ -2,6 +2,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t # RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld --eh-frame-hdr %t -o /dev/null --noinhibit-exec # CHECK: error: corrupted .eh_frame: unknown FDE encoding # CHECK-NEXT: >>> defined in {{.*}}:(.eh_frame+0xe) From 0f3c2884f3ccbdbe396e4388feb8be716b50dd68 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 15:25:04 -0800 Subject: [PATCH 112/432] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#124433) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect U to be nonnull. 
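For illustration, the semantic difference on a PointerUnion (a
hypothetical standalone example; A and B are made-up types, not clang
classes):

  #include "llvm/ADT/PointerUnion.h"

  struct A { int x; };
  struct B { int y; };

  // Tolerates a null union: yields nullptr when U is null or holds a B *.
  A *getAOrNull(llvm::PointerUnion<A *, B *> U) {
    return llvm::dyn_cast_if_present<A *>(U);
  }

  // Asserts that U is non-null and only checks the held member type; valid
  // only where nullness is impossible, the property relied on here.
  A *getANonNull(llvm::PointerUnion<A *, B *> U) {
    return llvm::dyn_cast<A *>(U);
  }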
--- clang/lib/AST/ParentMapContext.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ParentMapContext.cpp b/clang/lib/AST/ParentMapContext.cpp index af7d9fcdc638b..7ff492443031d 100644 --- a/clang/lib/AST/ParentMapContext.cpp +++ b/clang/lib/AST/ParentMapContext.cpp @@ -103,9 +103,9 @@ class ParentMapContext::ParentMap { static DynTypedNode getSingleDynTypedNodeFromParentMap(ParentMapPointers::mapped_type U) { - if (const auto *D = U.dyn_cast()) + if (const auto *D = dyn_cast(U)) return DynTypedNode::create(*D); - if (const auto *S = U.dyn_cast()) + if (const auto *S = dyn_cast(U)) return DynTypedNode::create(*S); return *cast(U); } From d2c7cabe0453d6a6d03c15b7ae1800b53de9e182 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 15:25:17 -0800 Subject: [PATCH 113/432] [Sema] Migrate away from PointerUnion::dyn_cast (NFC) (#124434) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect EWC->getObject(i) to be nonnull. --- clang/lib/Sema/JumpDiagnostics.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp index d465599450e7f..4b92d67e49d7d 100644 --- a/clang/lib/Sema/JumpDiagnostics.cpp +++ b/clang/lib/Sema/JumpDiagnostics.cpp @@ -561,12 +561,12 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S, // implementable but a lot of work which we haven't felt up to doing. ExprWithCleanups *EWC = cast(S); for (unsigned i = 0, e = EWC->getNumObjects(); i != e; ++i) { - if (auto *BDecl = EWC->getObject(i).dyn_cast()) + if (auto *BDecl = dyn_cast(EWC->getObject(i))) for (const auto &CI : BDecl->captures()) { VarDecl *variable = CI.getVariable(); BuildScopeInformation(variable, BDecl, origParentScope); } - else if (auto *CLE = EWC->getObject(i).dyn_cast()) + else if (auto *CLE = dyn_cast(EWC->getObject(i))) BuildScopeInformation(CLE, origParentScope); else llvm_unreachable("unexpected cleanup object type"); From 4f480481716553aa89142131f49e53e7d53c1998 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 15:28:17 -0800 Subject: [PATCH 114/432] [ELF] SHF_MERGE: avoid Fatal In LLD_IN_TEST=2 mode, when a thread calls Fatal, there will be no output even if the process exits with code 1. 
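For context, the constraint being diagnosed (a hypothetical sketch, not
the exact lld code; compare ObjFile::shouldMerge below): an SHF_MERGE
section is an array of sh_entsize-byte entries, so its size must be an
exact multiple of sh_entsize, and it must not be writable.

  #include "llvm/BinaryFormat/ELF.h"
  #include <cstdint>

  // True if the section is a well-formed SHF_MERGE candidate.
  bool isWellFormedMergeSection(uint64_t shSize, uint64_t entSize,
                                uint64_t shFlags) {
    if (entSize == 0)
      return false; // sh_entsize 0: not mergeable, but not an error
    if (shSize % entSize != 0)
      return false; // e.g. size 2 with sh_entsize 4, as in the test below
    if (shFlags & llvm::ELF::SHF_WRITE)
      return false; // writable SHF_MERGE sections are unsupported
    return true;
  }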
--- lld/ELF/InputFiles.cpp | 11 +++++------ lld/test/ELF/invalid/merge-invalid-size.s | 2 +- lld/test/ELF/invalid/merge-writable.s | 1 + 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index c3c6812c26202..b29c7db879fa0 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -700,13 +700,12 @@ bool ObjFile::shouldMerge(const Elf_Shdr &sec, StringRef name) { if (entSize == 0) return false; if (sec.sh_size % entSize) - Fatal(ctx) << this << ":(" << name << "): SHF_MERGE section size (" - << uint64_t(sec.sh_size) - << ") must be a multiple of sh_entsize (" << entSize << ")"; - + ErrAlways(ctx) << this << ":(" << name << "): SHF_MERGE section size (" + << uint64_t(sec.sh_size) + << ") must be a multiple of sh_entsize (" << entSize << ")"; if (sec.sh_flags & SHF_WRITE) - Fatal(ctx) << this << ":(" << name - << "): writable SHF_MERGE section is not supported"; + Err(ctx) << this << ":(" << name + << "): writable SHF_MERGE section is not supported"; return true; } diff --git a/lld/test/ELF/invalid/merge-invalid-size.s b/lld/test/ELF/invalid/merge-invalid-size.s index 71c3f98e75529..82ad1f97b4a93 100644 --- a/lld/test/ELF/invalid/merge-invalid-size.s +++ b/lld/test/ELF/invalid/merge-invalid-size.s @@ -1,6 +1,6 @@ // REQUIRES: x86 // RUN: llvm-mc %s -o %t.o -filetype=obj -triple=x86_64-pc-linux -// RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s +// RUN: not ld.lld %t.o -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s // CHECK: merge-invalid-size.s.tmp.o:(.foo): SHF_MERGE section size (2) must be a multiple of sh_entsize (4) .section .foo,"aM",@progbits,4 diff --git a/lld/test/ELF/invalid/merge-writable.s b/lld/test/ELF/invalid/merge-writable.s index 0c5fe92481da0..24a274b193576 100644 --- a/lld/test/ELF/invalid/merge-writable.s +++ b/lld/test/ELF/invalid/merge-writable.s @@ -1,6 +1,7 @@ // REQUIRES: x86 // RUN: llvm-mc %s -o %t.o -filetype=obj -triple=x86_64-pc-linux // RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s +// RUN: ld.lld %t.o -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s // CHECK: merge-writable.s.tmp.o:(.foo): writable SHF_MERGE section is not supported .section .foo,"awM",@progbits,4 From c7579bfba5969377f7fb4239cc05d6cd4a077957 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 15:50:29 -0800 Subject: [PATCH 115/432] [ELF] -o -: suppress output if disableOutput So that LLD_IN_TEST=2 ld.lld -o - a.o only writes the output once. --- lld/ELF/Writer.cpp | 8 +++++--- lld/test/ELF/stdout.s | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index fe4a0a15ae835..b7c4790655e8a 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -381,9 +381,11 @@ template void Writer::run() { if (errCount(ctx)) return; - if (auto e = buffer->commit()) - Err(ctx) << "failed to write output '" << buffer->getPath() - << "': " << std::move(e); + if (!ctx.e.disableOutput) { + if (auto e = buffer->commit()) + Err(ctx) << "failed to write output '" << buffer->getPath() + << "': " << std::move(e); + } if (!ctx.arg.cmseOutputLib.empty()) writeARMCmseImportLib(ctx); diff --git a/lld/test/ELF/stdout.s b/lld/test/ELF/stdout.s index 64cf64a72b4b6..b5ec07cfabfe9 100644 --- a/lld/test/ELF/stdout.s +++ b/lld/test/ELF/stdout.s @@ -1,7 +1,8 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o -# RUN: ld.lld %t.o -o - > %t1 +## Test that we only write to "-" once. 
+# RUN: LLD_IN_TEST=2 ld.lld %t.o -o - > %t1 # RUN: llvm-objdump -d %t1 | FileCheck %s # CHECK: nop From 7db789b5702714ffb6c96ad53c3136ca0a4300b2 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 16:00:51 -0800 Subject: [PATCH 116/432] [ELF] Replace a few Fatal with Err In LLD_IN_TEST=2 mode, when a thread calls Fatal, there will be no output even if the process exits with code 1. Change a few Fatal to recoverable Err. --- lld/ELF/InputSection.cpp | 24 +++++++++++++-------- lld/test/ELF/compressed-input-err.s | 1 + lld/test/ELF/invalid/section-alignment.test | 1 + lld/test/ELF/invalid/section-alignment2.s | 1 + 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 42ef530b79d89..56928b7c9547b 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -71,8 +71,10 @@ InputSectionBase::InputSectionBase(InputFile *file, StringRef name, // The ELF spec states that a value of 0 means the section has // no alignment constraints. uint32_t v = std::max(addralign, 1); - if (!isPowerOf2_64(v)) - Fatal(getCtx()) << this << ": sh_addralign is not a power of 2"; + if (!isPowerOf2_64(v)) { + Err(getCtx()) << this << ": sh_addralign is not a power of 2"; + v = 1; + } this->addralign = v; // If SHF_COMPRESSED is set, parse the header. The legacy .zdebug format is no @@ -104,8 +106,10 @@ InputSectionBase::InputSectionBase(ObjFile &file, // We reject object files having insanely large alignments even though // they are allowed by the spec. I think 4GB is a reasonable limitation. // We might want to relax this in the future. - if (hdr.sh_addralign > UINT32_MAX) - Fatal(getCtx()) << &file << ": section sh_addralign is too large"; + if (hdr.sh_addralign > UINT32_MAX) { + Err(getCtx()) << &file << ": section sh_addralign is too large"; + addralign = 1; + } } size_t InputSectionBase::getSize() const { @@ -123,7 +127,7 @@ static void decompressAux(Ctx &ctx, const InputSectionBase &sec, uint8_t *out, if (Error e = hdr->ch_type == ELFCOMPRESS_ZLIB ? compression::zlib::decompress(compressed, out, size) : compression::zstd::decompress(compressed, out, size)) - Fatal(ctx) << &sec << ": decompress failed: " << std::move(e); + Err(ctx) << &sec << ": decompress failed: " << std::move(e); } void InputSectionBase::decompress() const { @@ -649,9 +653,11 @@ static uint64_t getRISCVUndefinedRelativeWeakVA(uint64_t type, uint64_t p) { // of the RW segment. static uint64_t getARMStaticBase(const Symbol &sym) { OutputSection *os = sym.getOutputSection(); - if (!os || !os->ptLoad || !os->ptLoad->firstSec) - Fatal(os->ctx) << "SBREL relocation to " << sym.getName() - << " without static base"; + if (!os || !os->ptLoad || !os->ptLoad->firstSec) { + Err(os->ctx) << "SBREL relocation to " << sym.getName() + << " without static base"; + return 0; + } return os->ptLoad->firstSec->addr; } @@ -1304,7 +1310,7 @@ template void InputSection::writeTo(Ctx &ctx, uint8_t *buf) { if (Error e = hdr->ch_type == ELFCOMPRESS_ZLIB ? 
compression::zlib::decompress(compressed, buf, size) : compression::zstd::decompress(compressed, buf, size)) - Fatal(ctx) << this << ": decompress failed: " << std::move(e); + Err(ctx) << this << ": decompress failed: " << std::move(e); uint8_t *bufEnd = buf + size; relocate(ctx, buf, bufEnd); return; diff --git a/lld/test/ELF/compressed-input-err.s b/lld/test/ELF/compressed-input-err.s index 83b1f62d7e495..7251585ed5d70 100644 --- a/lld/test/ELF/compressed-input-err.s +++ b/lld/test/ELF/compressed-input-err.s @@ -9,6 +9,7 @@ # RUN: yaml2obj --docnum=3 %s -o %t3.o # RUN: not ld.lld %t3.o -o /dev/null -shared 2>&1 | FileCheck %s +# RUN: ld.lld %t3.o -o /dev/null -shared --noinhibit-exec ## Check we are able to report zlib decompress errors. # CHECK: error: {{.*}}.o:(.debug_info): decompress failed: zlib error: Z_DATA_ERROR diff --git a/lld/test/ELF/invalid/section-alignment.test b/lld/test/ELF/invalid/section-alignment.test index 8099ec01849b6..32e673f82992b 100644 --- a/lld/test/ELF/invalid/section-alignment.test +++ b/lld/test/ELF/invalid/section-alignment.test @@ -1,5 +1,6 @@ # RUN: yaml2obj %s -o %t # RUN: not ld.lld %t -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld %t -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s ## In current lld implementation, we do not accept sh_addralign ## larger than UINT32_MAX. diff --git a/lld/test/ELF/invalid/section-alignment2.s b/lld/test/ELF/invalid/section-alignment2.s index c130bbbaa071f..c180860ca4127 100644 --- a/lld/test/ELF/invalid/section-alignment2.s +++ b/lld/test/ELF/invalid/section-alignment2.s @@ -1,5 +1,6 @@ # RUN: yaml2obj %s -o %t.o # RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld %t.o -o /dev/null --noinhibit-exec # CHECK: error: {{.*}}.o:(.text): sh_addralign is not a power of 2 From 6b87f01aaaa9d7c6eef8b66e48f13eb8492c7503 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 16:20:27 -0800 Subject: [PATCH 117/432] [ELF] MergeInputSection: replace Fatal with Err In LLD_IN_TEST=2 mode, when a thread calls Fatal, there will be no output even if the process exits with code 1. Change a few Fatal to recoverable Err. --- lld/ELF/InputSection.cpp | 13 +++++++++---- lld/test/ELF/merge-string-error.s | 11 ----------- lld/test/ELF/mergeable-errors.s | 1 + lld/test/ELF/relocation-past-merge-end.s | 9 ++++++++- 4 files changed, 18 insertions(+), 16 deletions(-) delete mode 100644 lld/test/ELF/merge-string-error.s diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 56928b7c9547b..52c472bb89caf 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -1433,8 +1433,11 @@ static size_t findNull(StringRef s, size_t entSize) { void MergeInputSection::splitStrings(StringRef s, size_t entSize) { const bool live = !(flags & SHF_ALLOC) || !getCtx().arg.gcSections; const char *p = s.data(), *end = s.data() + s.size(); - if (!std::all_of(end - entSize, end, [](char c) { return c == 0; })) - Fatal(getCtx()) << this << ": string is not null terminated"; + if (!std::all_of(end - entSize, end, [](char c) { return c == 0; })) { + Err(getCtx()) << this << ": string is not null terminated"; + pieces.emplace_back(entSize, 0, false); + return; + } if (entSize == 1) { // Optimize the common case. 
do { @@ -1494,8 +1497,10 @@ void MergeInputSection::splitIntoPieces() { } SectionPiece &MergeInputSection::getSectionPiece(uint64_t offset) { - if (content().size() <= offset) - Fatal(getCtx()) << this << ": offset is outside the section"; + if (content().size() <= offset) { + Err(getCtx()) << this << ": offset is outside the section"; + return pieces[0]; + } return partition_point( pieces, [=](SectionPiece p) { return p.inputOff <= offset; })[-1]; } diff --git a/lld/test/ELF/merge-string-error.s b/lld/test/ELF/merge-string-error.s deleted file mode 100644 index bd77a4c1dce87..0000000000000 --- a/lld/test/ELF/merge-string-error.s +++ /dev/null @@ -1,11 +0,0 @@ -// REQUIRES: x86 -// RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o -// RUN: not ld.lld %t.o -o /dev/null -shared 2>&1 | FileCheck %s - - .section .rodata.str1.1,"aMS",@progbits,1 - .asciz "abc" - - .data - .quad .rodata.str1.1 + 4 - -// CHECK: merge-string-error.s.tmp.o:(.rodata.str1.1): offset is outside the section diff --git a/lld/test/ELF/mergeable-errors.s b/lld/test/ELF/mergeable-errors.s index d67cd91c97fbf..b155d581046a8 100644 --- a/lld/test/ELF/mergeable-errors.s +++ b/lld/test/ELF/mergeable-errors.s @@ -1,6 +1,7 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o # RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld %t.o -o /dev/null --noinhibit-exec # CHECK: error: {{.*}}.o:(.mergeable): string is not null terminated diff --git a/lld/test/ELF/relocation-past-merge-end.s b/lld/test/ELF/relocation-past-merge-end.s index 15214a5a4fc05..1dced95c49ac2 100644 --- a/lld/test/ELF/relocation-past-merge-end.s +++ b/lld/test/ELF/relocation-past-merge-end.s @@ -1,9 +1,16 @@ // REQUIRES: x86 // RUN: llvm-mc %s -o %t.o -filetype=obj -triple=x86_64-pc-linux // RUN: not ld.lld %t.o -o /dev/null -shared 2>&1 | FileCheck %s -// CHECK: relocation-past-merge-end.s.tmp.o:(.foo): offset is outside the section +// RUN: ld.lld %t.o -o /dev/null -shared --noinhibit-exec 2>&1 | FileCheck %s +// CHECK: .o:(.foo): offset is outside the section +// CHECCK: .o:(.rodata.str1.1): offset is outside the section .data .quad .foo + 10 +.quad .rodata.str1.1 + 4 + .section .foo,"aM",@progbits,4 .quad 0 + +.section .rodata.str1.1,"aMS",@progbits,1 +.asciz "abc" From a9e92beb253d4bbd7636d99f100940534f3a7f36 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 16:54:19 -0800 Subject: [PATCH 118/432] [ELF] openAuxiliaryFile: open /dev/null if disableOutput and filename is "-" So that LLD_IN_TEST=2 ld.lld --print-archive-stats=- a.o (and -Map -) only writes the output once. --- lld/ELF/Driver.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 13e8f8ce6df20..c0a27b3939a54 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -105,6 +105,13 @@ llvm::raw_fd_ostream Ctx::openAuxiliaryFile(llvm::StringRef filename, using namespace llvm::sys::fs; OpenFlags flags = auxiliaryFiles.insert(filename).second ? 
OF_None : OF_Append; + if (e.disableOutput && filename == "-") { +#ifdef _WIN32 + filename = "NUL"; +#else + filename = "/dev/null"; +#endif + } return {filename, ec, flags}; } From b7195e8e040d57bbf502f34ec84d71bd123f85b8 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 16:58:52 -0800 Subject: [PATCH 119/432] [ELF,test] Add env LLD_IN_TEST=1 to make some tests work if RUN_LLD_MAIN_TWICE --- lld/test/ELF/basic.s | 2 +- lld/test/ELF/stdout.s | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lld/test/ELF/basic.s b/lld/test/ELF/basic.s index 587fd1641500a..b01f51eb4a2c7 100644 --- a/lld/test/ELF/basic.s +++ b/lld/test/ELF/basic.s @@ -220,7 +220,7 @@ _start: ## Test erroring on a recursive response file, but only once. # RUN: echo @%t.responsefile > %t.responsefile -# RUN: not ld.lld %t @%t.responsefile 2>&1 | FileCheck %s --check-prefix=RECRSP +# RUN: env LLD_IN_TEST=1 not ld.lld %t @%t.responsefile 2>&1 | FileCheck %s --check-prefix=RECRSP # RECRSP: recursive expansion of: '{{.*}}.responsefile' # RECRSP-NOT: recursive expansion of diff --git a/lld/test/ELF/stdout.s b/lld/test/ELF/stdout.s index b5ec07cfabfe9..e33ab3f792c40 100644 --- a/lld/test/ELF/stdout.s +++ b/lld/test/ELF/stdout.s @@ -2,7 +2,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o ## Test that we only write to "-" once. -# RUN: LLD_IN_TEST=2 ld.lld %t.o -o - > %t1 +# RUN: env LLD_IN_TEST=2 ld.lld %t.o -o - > %t1 # RUN: llvm-objdump -d %t1 | FileCheck %s # CHECK: nop From f21c35d54f8f7af9d0c64b566cabbc4f796a54df Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 17:29:28 -0800 Subject: [PATCH 120/432] [ELF] Replace some Fatal with Err In LLD_IN_TEST=2 mode, when a thread calls Fatal, there will be no output even if the process exits with code 1. Change a few Fatal to recoverable Err. --- lld/ELF/Driver.cpp | 6 ++- lld/ELF/InputFiles.cpp | 52 +++++++++++++++---------- lld/ELF/Relocations.cpp | 6 ++- lld/test/ELF/invalid/section-index.test | 1 + lld/test/ELF/invalid/symbol-name.test | 1 + 5 files changed, 41 insertions(+), 25 deletions(-) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index c0a27b3939a54..770163f4de086 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -2425,8 +2425,10 @@ static void findKeepUniqueSections(Ctx &ctx, opt::InputArgList &args) { unsigned size; const char *err = nullptr; uint64_t symIndex = decodeULEB128(cur, &size, contents.end(), &err); - if (err) - Fatal(ctx) << f << ": could not decode addrsig section: " << err; + if (err) { + Err(ctx) << f << ": could not decode addrsig section: " << err; + break; + } markAddrsig(icfSafe, syms[symIndex]); cur += size; } diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index b29c7db879fa0..e236057a0d6d1 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -921,17 +921,18 @@ static void readGnuProperty(Ctx &ctx, const InputSection &sec, using Elf_Note = typename ELFT::Note; ArrayRef data = sec.content(); - auto reportFatal = [&](const uint8_t *place, const Twine &msg) { - Fatal(ctx) << sec.file << ":(" << sec.name << "+0x" - << Twine::utohexstr(place - sec.content().data()) - << "): " << msg; + auto err = [&](const uint8_t *place) -> ELFSyncStream { + auto diag = Err(ctx); + diag << sec.file << ":(" << sec.name << "+0x" + << Twine::utohexstr(place - sec.content().data()) << "): "; + return diag; }; while (!data.empty()) { // Read one NOTE record. 
auto *nhdr = reinterpret_cast(data.data()); if (data.size() < sizeof(Elf_Nhdr) || data.size() < nhdr->getSize(sec.addralign)) - reportFatal(data.data(), "data is too short"); + return void(err(data.data()) << "data is too short"); Elf_Note note(*nhdr); if (nhdr->n_type != NT_GNU_PROPERTY_TYPE_0 || note.getName() != "GNU") { @@ -948,30 +949,32 @@ static void readGnuProperty(Ctx &ctx, const InputSection &sec, while (!desc.empty()) { const uint8_t *place = desc.data(); if (desc.size() < 8) - reportFatal(place, "program property is too short"); + return void(err(place) << "program property is too short"); uint32_t type = read32(desc.data()); uint32_t size = read32(desc.data() + 4); desc = desc.slice(8); if (desc.size() < size) - reportFatal(place, "program property is too short"); + return void(err(place) << "program property is too short"); if (type == featureAndType) { // We found a FEATURE_1_AND field. There may be more than one of these // in a .note.gnu.property section, for a relocatable object we // accumulate the bits set. if (size < 4) - reportFatal(place, "FEATURE_1_AND entry is too short"); + return void(err(place) << "FEATURE_1_AND entry is too short"); f.andFeatures |= read32(desc.data()); } else if (ctx.arg.emachine == EM_AARCH64 && type == GNU_PROPERTY_AARCH64_FEATURE_PAUTH) { if (!f.aarch64PauthAbiCoreInfo.empty()) { - reportFatal(data.data(), - "multiple GNU_PROPERTY_AARCH64_FEATURE_PAUTH entries are " - "not supported"); + return void( + err(data.data()) + << "multiple GNU_PROPERTY_AARCH64_FEATURE_PAUTH entries are " + "not supported"); } else if (size != 16) { - reportFatal(data.data(), "GNU_PROPERTY_AARCH64_FEATURE_PAUTH entry " - "is invalid: expected 16 bytes, but got " + - Twine(size)); + return void(err(data.data()) + << "GNU_PROPERTY_AARCH64_FEATURE_PAUTH entry " + "is invalid: expected 16 bytes, but got " + << size); } f.aarch64PauthAbiCoreInfo = desc; } @@ -1173,8 +1176,10 @@ void ObjFile::initSectionsAndLocalSyms(bool ignoreComdats) { secIdx = check(getExtendedSymbolTableIndex(eSym, i, shndxTable)); else if (secIdx >= SHN_LORESERVE) secIdx = 0; - if (LLVM_UNLIKELY(secIdx >= sections.size())) - Fatal(ctx) << this << ": invalid section index: " << secIdx; + if (LLVM_UNLIKELY(secIdx >= sections.size())) { + Err(ctx) << this << ": invalid section index: " << secIdx; + secIdx = 0; + } if (LLVM_UNLIKELY(eSym.getBinding() != STB_LOCAL)) ErrAlways(ctx) << this << ": non-local symbol (" << i << ") found at index < .symtab's sh_info (" << end << ")"; @@ -1183,9 +1188,12 @@ void ObjFile::initSectionsAndLocalSyms(bool ignoreComdats) { uint8_t type = eSym.getType(); if (type == STT_FILE) sourceFile = CHECK2(eSym.getName(stringTable), this); - if (LLVM_UNLIKELY(stringTable.size() <= eSym.st_name)) - Fatal(ctx) << this << ": invalid symbol name offset"; - StringRef name(stringTable.data() + eSym.st_name); + unsigned stName = eSym.st_name; + if (LLVM_UNLIKELY(stringTable.size() <= stName)) { + Err(ctx) << this << ": invalid symbol name offset"; + stName = 0; + } + StringRef name(stringTable.data() + stName); symbols[i] = reinterpret_cast(locals + i); if (eSym.st_shndx == SHN_UNDEF || sec == &InputSection::discarded) @@ -1236,8 +1244,10 @@ template void ObjFile::postParse() { secIdx = 0; } - if (LLVM_UNLIKELY(secIdx >= sections.size())) - Fatal(ctx) << this << ": invalid section index: " << secIdx; + if (LLVM_UNLIKELY(secIdx >= sections.size())) { + Err(ctx) << this << ": invalid section index: " << secIdx; + continue; + } InputSectionBase *sec = sections[secIdx]; if (sec == 
&InputSection::discarded) { if (sym.traced) { diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 76b151b93d517..629702b45965b 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -428,8 +428,10 @@ class OffsetGetter { if (j == fdes.begin() || j[-1].inputOff + j[-1].size <= off) { while (i != cies.end() && i->inputOff <= off) ++i; - if (i == cies.begin() || i[-1].inputOff + i[-1].size <= off) - Fatal(ctx) << ".eh_frame: relocation is not in any piece"; + if (i == cies.begin() || i[-1].inputOff + i[-1].size <= off) { + Err(ctx) << ".eh_frame: relocation is not in any piece"; + return 0; + } it = i; } diff --git a/lld/test/ELF/invalid/section-index.test b/lld/test/ELF/invalid/section-index.test index cc8c6d067265a..370597b7b7a2d 100644 --- a/lld/test/ELF/invalid/section-index.test +++ b/lld/test/ELF/invalid/section-index.test @@ -3,6 +3,7 @@ # RUN: yaml2obj %s -o %t1.o # RUN: not ld.lld %t1.o -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld %t1.o -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s # CHECK: {{.*}}1.o: invalid section index: 256 !ELF diff --git a/lld/test/ELF/invalid/symbol-name.test b/lld/test/ELF/invalid/symbol-name.test index 1ae76f0bd81e7..73284a1b9b842 100644 --- a/lld/test/ELF/invalid/symbol-name.test +++ b/lld/test/ELF/invalid/symbol-name.test @@ -1,5 +1,6 @@ # RUN: yaml2obj %s -o %t.o # RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld %t.o -o /dev/null --noinhibit-exec # CHECK: error: {{.*}}.o: invalid symbol name offset ## YAML below contains symbol with name offset in st_name From 988978f964fb84cb99c83e6cd260dcc395afb6c2 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 17:51:24 -0800 Subject: [PATCH 121/432] [ELF,test] Add env LLD_IN_TEST=1 to make some tests work if RUN_LLD_MAIN_TWICE --- lld/test/ELF/invalid/bad-reloc-target.test | 2 +- lld/test/ELF/lto/cache-warnings.ll | 1 + lld/test/ELF/lto/ltopasses-custom.ll | 4 ++-- lld/test/ELF/lto/verify-invalid.ll | 6 +++--- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/lld/test/ELF/invalid/bad-reloc-target.test b/lld/test/ELF/invalid/bad-reloc-target.test index 88b4cdf96779f..6a1619e81b80b 100644 --- a/lld/test/ELF/invalid/bad-reloc-target.test +++ b/lld/test/ELF/invalid/bad-reloc-target.test @@ -51,7 +51,7 @@ Symbols: ## Relocation refers to a symbol with index larger than ## symbol table size. Check we report it. # RUN: yaml2obj --docnum=3 %s -o %t2.o -# RUN: not ld.lld %t2.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR3 +# RUN: env LLD_IN_TEST=1 not ld.lld %t2.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR3 # ERR3: error: {{.*}}.o: invalid symbol index --- !ELF diff --git a/lld/test/ELF/lto/cache-warnings.ll b/lld/test/ELF/lto/cache-warnings.ll index d8c5ea963ec13..d0224d5426ff3 100644 --- a/lld/test/ELF/lto/cache-warnings.ll +++ b/lld/test/ELF/lto/cache-warnings.ll @@ -1,4 +1,5 @@ ; REQUIRES: x86, shell +; UNSUPPORTED: main-run-twice ; RUN: opt -module-hash -module-summary %s -o %t.o ; RUN: opt -module-hash -module-summary %p/Inputs/cache.ll -o %t2.o diff --git a/lld/test/ELF/lto/ltopasses-custom.ll b/lld/test/ELF/lto/ltopasses-custom.ll index ecb024cecade4..e37083ca8b8c7 100644 --- a/lld/test/ELF/lto/ltopasses-custom.ll +++ b/lld/test/ELF/lto/ltopasses-custom.ll @@ -24,13 +24,13 @@ define void @barrier() { ; ATOMIC-NEXT: ret void ; Check that invalid passes are rejected gracefully. 
-; RUN: not --crash ld.lld -m elf_x86_64 %t.o -o /dev/null \
+; RUN: env LLD_IN_TEST=1 not --crash ld.lld -m elf_x86_64 %t.o -o /dev/null \
 ; RUN:   --lto-newpm-passes=iamnotapass -shared 2>&1 | \
 ; RUN:   FileCheck %s --check-prefix=INVALID
 ; INVALID: unable to parse pass pipeline description 'iamnotapass': unknown pass name 'iamnotapass'

 ; Check that invalid AA pipelines are rejected gracefully.
-; RUN: not --crash ld.lld -m elf_x86_64 %t.o -o /dev/null \
+; RUN: env LLD_IN_TEST=1 not --crash ld.lld -m elf_x86_64 %t.o -o /dev/null \
 ; RUN:   --lto-newpm-passes=globaldce --lto-aa-pipeline=patatino \
 ; RUN:   -shared 2>&1 | \
 ; RUN:   FileCheck %s --check-prefix=INVALIDAA
diff --git a/lld/test/ELF/lto/verify-invalid.ll b/lld/test/ELF/lto/verify-invalid.ll
index d97d0e1b78b8c..cb8bb389a608b 100644
--- a/lld/test/ELF/lto/verify-invalid.ll
+++ b/lld/test/ELF/lto/verify-invalid.ll
@@ -1,10 +1,10 @@
 ; REQUIRES: x86
 ; RUN: llvm-as %s -o %t.o
-; RUN: ld.lld %t.o -o %t2 --lto-debug-pass-manager \
+; RUN: env LLD_IN_TEST=1 ld.lld %t.o -o %t2 --lto-debug-pass-manager \
 ; RUN:   2>&1 | FileCheck -check-prefix=DEFAULT-NPM %s
-; RUN: ld.lld %t.o -o %t2 --lto-debug-pass-manager \
+; RUN: env LLD_IN_TEST=1 ld.lld %t.o -o %t2 --lto-debug-pass-manager \
 ; RUN:   -disable-verify 2>&1 | FileCheck -check-prefix=DISABLE-NPM %s
-; RUN: ld.lld %t.o -o %t2 --lto-debug-pass-manager \
+; RUN: env LLD_IN_TEST=1 ld.lld %t.o -o %t2 --lto-debug-pass-manager \
 ; RUN:   --plugin-opt=disable-verify 2>&1 | FileCheck -check-prefix=DISABLE-NPM %s

 target triple = "x86_64-unknown-linux-gnu"

From f359c1f524bf826eba355b8863a870450eb747b0 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 25 Jan 2025 18:01:52 -0800
Subject: [PATCH 122/432] [ELF] Disable error handling script if disableOutput

Fix ELF/error-handling-script-linux.test when LLD_IN_TEST=2 is set.
---
 lld/Common/ErrorHandler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lld/Common/ErrorHandler.cpp b/lld/Common/ErrorHandler.cpp
index 716bce54258ce..a11960325a9cd 100644
--- a/lld/Common/ErrorHandler.cpp
+++ b/lld/Common/ErrorHandler.cpp
@@ -289,7 +289,7 @@ void ErrorHandler::error(const Twine &msg) {

 void ErrorHandler::error(const Twine &msg, ErrorTag tag,
                          ArrayRef<StringRef> args) {
-  if (errorHandlingScript.empty()) {
+  if (errorHandlingScript.empty() || disableOutput) {
     error(msg);
     return;
   }

From 18335f4800ae5491a11e74a574969d716acddce7 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 25 Jan 2025 18:06:22 -0800
Subject: [PATCH 123/432] [ELF] Ignore --time-trace if disableOutput

To avoid generating two JSON files for LLD_IN_TEST=2 ld.lld --time-trace.
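For context, a rough model of the test mode these patches accommodate (an
assumption pieced together from the commit messages, not lld's exact
driver code):

  #include <vector>

  // Assumed entry point; disableOutput must suppress every artifact the
  // link would produce (the output file, -Map, --time-trace JSON, ...).
  int lldMain(const std::vector<const char *> &args, bool disableOutput);

  // LLD_IN_TEST=N: run the link N times in one process, with output
  // disabled on all but the final pass; a --time-trace JSON emitted on
  // every pass would therefore be written twice.
  int runLldInTest(const std::vector<const char *> &args, int n) {
    int ret = 0;
    for (int i = 0; i < n; ++i)
      ret = lldMain(args, /*disableOutput=*/i + 1 != n);
    return ret;
  }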
--- lld/ELF/Driver.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 770163f4de086..7e0d3fca31353 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1463,7 +1463,8 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) { } ctx.arg.thinLTOModulesToCompile = args::getStrings(args, OPT_thinlto_single_module_eq); - ctx.arg.timeTraceEnabled = args.hasArg(OPT_time_trace_eq); + ctx.arg.timeTraceEnabled = + args.hasArg(OPT_time_trace_eq) && !ctx.e.disableOutput; ctx.arg.timeTraceGranularity = args::getInteger(args, OPT_time_trace_granularity, 500); ctx.arg.trace = args.hasArg(OPT_trace); From c1f10ef0a5c15f1dccf87ff07699055297c715a5 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 18:13:42 -0800 Subject: [PATCH 124/432] [ELF] SHF_LINK_ORDER: replace Fatal with ErrAlways In LLD_IN_TEST=2 mode, when a thread calls Fatal, there will be no output even if the process exits with code 1. Change the Fatal to ErrAlways (not-recoverable) as subsequent code assumes SHF_LINK_ORDER sh_link is correct. --- lld/ELF/InputFiles.cpp | 8 +++++--- lld/test/ELF/invalid/linkorder-invalid-sec.test | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index e236057a0d6d1..eba4c234d3f16 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -890,9 +890,11 @@ void ObjFile::initializeSections(bool ignoreComdats, InputSectionBase *linkSec = nullptr; if (sec.sh_link < size) linkSec = this->sections[sec.sh_link]; - if (!linkSec) - Fatal(ctx) << this - << ": invalid sh_link index: " << uint32_t(sec.sh_link); + if (!linkSec) { + ErrAlways(ctx) << this + << ": invalid sh_link index: " << uint32_t(sec.sh_link); + continue; + } // A SHF_LINK_ORDER section is discarded if its linked-to section is // discarded. diff --git a/lld/test/ELF/invalid/linkorder-invalid-sec.test b/lld/test/ELF/invalid/linkorder-invalid-sec.test index a2f4ee8f5bc2b..e0132956f0ba1 100644 --- a/lld/test/ELF/invalid/linkorder-invalid-sec.test +++ b/lld/test/ELF/invalid/linkorder-invalid-sec.test @@ -1,6 +1,6 @@ # REQUIRES: x86 # RUN: yaml2obj %s -o %t.o -# RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s +# RUN: not ld.lld %t.o -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s # CHECK: invalid sh_link index: 12345 --- !ELF From c1ec5beb4ab36c2c4d99ed6d735d217e74364771 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 25 Jan 2025 18:31:42 -0800 Subject: [PATCH 125/432] [clang-format] Fix a TableGen crash on comment after l_paren (#124380) Fixes #124248. 
--- clang/lib/Format/TokenAnnotator.cpp | 10 ++++------
 clang/unittests/Format/FormatTestTableGen.cpp | 3 +++
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index bc41d43d1438c..655766178fbb0 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -1115,7 +1115,7 @@ class AnnotatingParser {
       }
       if (!CurrentToken || CurrentToken->isNot(tok::l_paren))
         return false;
-      skipToNextNonComment();
+      next();
       // FIXME: Hack using inheritance to child context
       Contexts.back().IsTableGenBangOpe = true;
       bool Result = parseParens();
@@ -1124,12 +1124,10 @@ class AnnotatingParser {
     }
     // SimpleValue 9: Cond operator
     if (Tok->is(TT_TableGenCondOperator)) {
-      Tok = CurrentToken;
-      skipToNextNonComment();
-      if (!Tok || Tok->isNot(tok::l_paren))
+      if (!CurrentToken || CurrentToken->isNot(tok::l_paren))
         return false;
-      bool Result = parseParens();
-      return Result;
+      next();
+      return parseParens();
     }
     // We have to check identifier at the last because the kind of bang/cond
     // operators are also identifier.
diff --git a/clang/unittests/Format/FormatTestTableGen.cpp b/clang/unittests/Format/FormatTestTableGen.cpp
index 7771f6a109a9a..92377c31f2e91 100644
--- a/clang/unittests/Format/FormatTestTableGen.cpp
+++ b/clang/unittests/Format/FormatTestTableGen.cpp
@@ -101,6 +101,9 @@ TEST_F(FormatTestTableGen, BangOperators) {
                " \"zerozero\",\n"
                " true: // default\n"
                " \"positivepositive\");\n"
+               " let CondOpe3WithCommentAfterLParen = !cond(\n"
+               " // comment\n"
+               " !eq(/* comment */ x, 0): \"zero\");\n"
                "}");
 }

From 9b6990ff2531942d534c9ef7db728af2437c3329 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sat, 25 Jan 2025 19:23:12 -0800
Subject: [PATCH 126/432] [Github][CI] Add Windows Premerge Job for Testing
 (#122661)

This patch adds a Windows premerge job for testing. We plan to enable
this by default soon once we have evaluated stability and have good
reason to believe the system is reliable.
--- .github/workflows/premerge.yaml | 64 ++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 30f4fc807f3a5..54d6e1bf092cf 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -16,7 +16,7 @@ jobs: if: github.repository_owner == 'llvm' runs-on: llvm-premerge-linux-runners concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }} + group: ${{ github.workflow }}-linux-${{ github.event.pull_request.number || github.sha }} cancel-in-progress: true steps: - name: Checkout LLVM @@ -70,3 +70,65 @@ jobs: export CXX=/opt/llvm/bin/clang++ ./.ci/monolithic-linux.sh "$(echo ${linux_projects} | tr ' ' ';')" "$(echo ${linux_check_targets})" "$(echo ${linux_runtimes} | tr ' ' ';')" "$(echo ${linux_runtime_check_targets})" + + premerge-checks-windows: + if: github.repository_owner == 'llvm' + runs-on: llvm-premerge-windows-runners + concurrency: + group: ${{ github.workflow }}-windows-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + defaults: + run: + shell: bash + steps: + - name: Checkout LLVM + uses: actions/checkout@v4 + with: + fetch-depth: 2 + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2.14 + with: + variant: "sccache" + max-size: "2000M" + - name: Compute Projects + id: vars + run: | + modified_files=$(git diff --name-only HEAD~1...HEAD) + modified_dirs=$(echo "$modified_files" | cut -d'/' -f1 | sort | uniq) + + echo $modified_files + echo $modified_dirs + + . ./.ci/compute-projects.sh + + all_projects="bolt clang clang-tools-extra compiler-rt cross-project-tests flang libc libclc lld lldb llvm mlir openmp polly pstl" + modified_projects="$(keep-modified-projects ${all_projects})" + + windows_projects_to_test=$(exclude-windows $(compute-projects-to-test 1 ${modified_projects})) + windows_check_targets=$(check-targets ${windows_projects_to_test} | sort | uniq | tr -d '\r' | tr '\n' ' ') + windows_projects=$(add-dependencies ${windows_projects_to_test} | sort | uniq | tr -d '\r' | tr '\n' ';') + + if [[ "${windows_projects}" == "" ]]; then + echo "No projects to build" + fi + + echo "Building projects: ${windows_projects}" + echo "Running project checks targets: ${windows_check_targets}" + + echo "windows-projects=${windows_projects}" >> $GITHUB_OUTPUT + echo "windows-check-targets=${windows_check_targets}" >> $GITHUB_OUTPUT + - name: Build and Test + # Mark the job as a success even if the step fails so that people do + # not get notified while the new premerge pipeline is in an + # experimental state. + # TODO(boomanaiden154): Remove this once the pipeline is stable and we + # are ready for people to start recieving notifications. 
+ continue-on-error: true + if: ${{ steps.vars.outputs.windows-projects != '' }} + shell: cmd + run: | + set MAX_PARALLEL_COMPILE_JOBS=64 + set MAX_PARALLEL_LINK_JOBS=64 + call C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64 + bash .ci/monolithic-windows.sh "${{ steps.vars.outputs.windows-projects }}" "${{ steps.vars.outputs.windows-check-targets }}" + From 6bb70a94da1b5c53143537f1d2e96602a74331ca Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Sat, 25 Jan 2025 19:48:46 -0800 Subject: [PATCH 127/432] workflows/release-binaries: Enable builds on Linux/AArch64 (#120786) --- .github/workflows/release-binaries-all.yml | 1 + .github/workflows/release-binaries.yml | 24 ++++++++++++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml index d5b2d33286101..d18b9b0b5c2ff 100644 --- a/.github/workflows/release-binaries-all.yml +++ b/.github/workflows/release-binaries-all.yml @@ -83,6 +83,7 @@ jobs: matrix: runs-on: - ubuntu-22.04 + - ubuntu-22.04-arm - macos-13 - macos-14 diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index f9a264e7cf48f..2ca4aea8a3b0e 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -18,6 +18,7 @@ on: type: choice options: - ubuntu-22.04 + - ubuntu-22.04-arm - macos-13 - macos-14 @@ -55,6 +56,7 @@ jobs: ref: ${{ steps.vars.outputs.ref }} upload: ${{ steps.vars.outputs.upload }} target-cmake-flags: ${{ steps.vars.outputs.target-cmake-flags }} + ccache: ${{ steps.vars.outputs.ccache }} build-flang: ${{ steps.vars.outputs.build-flang }} enable-pgo: ${{ steps.vars.outputs.enable-pgo }} release-binary-basename: ${{ steps.vars.outputs.release-binary-basename }} @@ -119,8 +121,16 @@ jobs: echo "release-binary-basename=$release_binary_basename" >> $GITHUB_OUTPUT echo "release-binary-filename=$release_binary_basename.tar.xz" >> $GITHUB_OUTPUT - # Detect necessary CMake flags target="$RUNNER_OS-$RUNNER_ARCH" + # The hendrikmuhs/ccache-action action does not support installing sccache + # on arm64 Linux. + if [ "$target" = "Linux-ARM64" ]; then + echo ccache=ccache >> $GITHUB_OUTPUT + else + echo ccache=sccache >> $GITHUB_OUTPUT + fi + + # Detect necessary CMake flags echo "enable-pgo=false" >> $GITHUB_OUTPUT target_cmake_flags="-DLLVM_RELEASE_ENABLE_PGO=OFF" # The macOS builds try to cross compile some libraries so we need to @@ -146,7 +156,7 @@ jobs: echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT echo "build-flang=$build_flang" >> $GITHUB_OUTPUT case "${{ inputs.runs-on }}" in - ubuntu-22.04) + ubuntu-22.04*) build_runs_on="depot-${{ inputs.runs-on }}-16" test_runs_on=$build_runs_on ;; @@ -221,12 +231,14 @@ jobs: with: # Default to 2G to workaround: https://github.com/hendrikmuhs/ccache-action/issues/174 max-size: 2G - key: sccache-${{ runner.os }}-${{ runner.arch }}-release - variant: sccache + key: ${{ needs.prepare.outputs.ccache }}-${{ runner.os }}-${{ runner.arch }}-release + variant: ${{ needs.prepare.outputs.ccache }} - name: Configure id: build shell: bash + env: + CCACHE_BIN: ${{ needs.prepare.outputs.ccache }} run: | # There were some issues on the ARM64 MacOS runners with trying to build x86 object, # so we need to set some extra cmake flags to disable this. 
@@ -235,8 +247,8 @@ jobs: -C clang/cmake/caches/Release.cmake \ -DBOOTSTRAP_LLVM_PARALLEL_LINK_JOBS=1 \ -DBOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}" \ - -DCMAKE_C_COMPILER_LAUNCHER=sccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=sccache + -DCMAKE_C_COMPILER_LAUNCHER=$CCACHE_BIN \ + -DCMAKE_CXX_COMPILER_LAUNCHER=$CCACHE_BIN - name: Build shell: bash run: | From 44b85743498a88cb9fd1281bdfac47c93fcf6fee Mon Sep 17 00:00:00 2001 From: Palmer Date: Sat, 25 Jan 2025 23:30:55 -0500 Subject: [PATCH 128/432] [AArch64] Fix movk parsing with an .equ operand (#124428) Prior to 5da801386c2b820a4596fc6d8da6b5f4a6da94b4, this code worked: .equ p4_low_b0, 0x0000 movk x1, p4_low_b0, lsl 16 (The code above is from the isa-l project - I discovered this issue while trying to compile it with clang 19 on MacOS on aarch64) That commit fixed a different bug, but accidentally broke the case where the second operand to movk is not a literal. In 442f066fc464e953b7783230e95ccf2a67ebfb38, a fix was applied to handle the case where the second operand is a value like "(Val) >> 16". However, that didn't appear to fix the test case in this commit. In this commit, we extend the change to handle the case where the second operand is a identifier defined by .equ. Fixes #124427 --- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 4 +++- llvm/test/MC/AArch64/basic-a64-instructions.s | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index d3eda48f3276e..27b052825d213 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -5017,7 +5017,9 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, return true; E = SMLoc::getFromPointer(getLoc().getPointer() - 1); Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext())); - return false; + + // Parse an optional shift/extend modifier. + return parseOptionalShiftExtend(getTok()); } case AsmToken::Integer: case AsmToken::Real: diff --git a/llvm/test/MC/AArch64/basic-a64-instructions.s b/llvm/test/MC/AArch64/basic-a64-instructions.s index 0ae23d672e4a3..14ac11f581a55 100644 --- a/llvm/test/MC/AArch64/basic-a64-instructions.s +++ b/llvm/test/MC/AArch64/basic-a64-instructions.s @@ -3347,6 +3347,11 @@ _func: // CHECK: mov x2, #5299989643264 // encoding: [0x42,0x9a,0xc0,0xd2] // CHECK: movk xzr, #{{4321|0x10e1}}, lsl #48 // encoding: [0x3f,0x1c,0xe2,0xf2] + .equ equvalue, 0x0001 + movk x1, equvalue, lsl 16 +// CHECK: .set equvalue, 1 +// CHECK-NEXT: movk x1, #1, lsl #16 // encoding: [0x21,0x00,0xa0,0xf2] + movz x2, #:abs_g0:sym movk w3, #:abs_g0_nc:sym From 753028bc81c1a556eaaaf45ac77ca0cf4c7a3b4a Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 22:04:03 -0800 Subject: [PATCH 129/432] [Xtensa] Move XtensaUtils to MCTargetDesc PR #121118 attempted to introduce `checkRegister` used by XtensaDisassembler. Since `checkRegister` and other functions in XtensaUtils.cpp cannot link against XtensaCodeGen, move them to XtensaDesc, which can be used by XtensaDisassembler. 
Pull Request: https://github.com/llvm/llvm-project/pull/123969 --- llvm/lib/Target/Xtensa/CMakeLists.txt | 1 - .../MCTargetDesc/XtensaMCTargetDesc.cpp | 42 +++++++++++++ .../Xtensa/MCTargetDesc/XtensaMCTargetDesc.h | 10 ++++ llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp | 4 +- llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp | 4 +- llvm/lib/Target/Xtensa/XtensaUtils.cpp | 59 ------------------- llvm/lib/Target/Xtensa/XtensaUtils.h | 27 --------- 7 files changed, 56 insertions(+), 91 deletions(-) delete mode 100644 llvm/lib/Target/Xtensa/XtensaUtils.cpp delete mode 100644 llvm/lib/Target/Xtensa/XtensaUtils.h diff --git a/llvm/lib/Target/Xtensa/CMakeLists.txt b/llvm/lib/Target/Xtensa/CMakeLists.txt index 726efadc87c0b..4fc1ba6dfa650 100644 --- a/llvm/lib/Target/Xtensa/CMakeLists.txt +++ b/llvm/lib/Target/Xtensa/CMakeLists.txt @@ -24,7 +24,6 @@ add_llvm_target(XtensaCodeGen XtensaRegisterInfo.cpp XtensaSubtarget.cpp XtensaTargetMachine.cpp - XtensaUtils.cpp LINK_COMPONENTS AsmPrinter diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp index 2653c293dc0c4..fc23c2356825f 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp @@ -32,6 +32,48 @@ using namespace llvm; +bool Xtensa::isValidAddrOffset(int Scale, int64_t OffsetVal) { + bool Valid = false; + + switch (Scale) { + case 1: + Valid = (OffsetVal >= 0 && OffsetVal <= 255); + break; + case 2: + Valid = (OffsetVal >= 0 && OffsetVal <= 510) && ((OffsetVal & 0x1) == 0); + break; + case 4: + Valid = (OffsetVal >= 0 && OffsetVal <= 1020) && ((OffsetVal & 0x3) == 0); + break; + default: + break; + } + return Valid; +} + +bool Xtensa::isValidAddrOffsetForOpcode(unsigned Opcode, int64_t Offset) { + int Scale = 0; + + switch (Opcode) { + case Xtensa::L8UI: + case Xtensa::S8I: + Scale = 1; + break; + case Xtensa::L16SI: + case Xtensa::L16UI: + case Xtensa::S16I: + Scale = 2; + break; + case Xtensa::LEA_ADD: + return (Offset >= -128 && Offset <= 127); + default: + // assume that MI is 32-bit load/store operation + Scale = 4; + break; + } + return isValidAddrOffset(Scale, Offset); +} + static MCAsmInfo *createXtensaMCAsmInfo(const MCRegisterInfo &MRI, const Triple &TT, const MCTargetOptions &Options) { diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h index 0e075be0df07f..6be54867d84a7 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h @@ -28,6 +28,7 @@ class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; class MCTargetOptions; +class MachineInstr; class StringRef; class Target; class raw_ostream; @@ -43,6 +44,15 @@ MCAsmBackend *createXtensaMCAsmBackend(const Target &T, const MCTargetOptions &Options); std::unique_ptr createXtensaObjectWriter(uint8_t OSABI, bool IsLittleEndian); + +namespace Xtensa { +// Check address offset for load/store instructions. +// The offset should be multiple of scale. +bool isValidAddrOffset(int Scale, int64_t OffsetVal); + +// Check address offset for load/store instructions. +bool isValidAddrOffsetForOpcode(unsigned Opcode, int64_t Offset); +} // namespace Xtensa } // end namespace llvm // Defines symbolic names for Xtensa registers. 
diff --git a/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp b/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp index ef14095d18efb..06cccd4831bfc 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp @@ -10,9 +10,9 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/XtensaMCTargetDesc.h" #include "Xtensa.h" #include "XtensaTargetMachine.h" -#include "XtensaUtils.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" @@ -75,7 +75,7 @@ class XtensaDAGToDAGISel : public SelectionDAGISel { ConstantSDNode *CN = dyn_cast(Addr.getOperand(1)); int64_t OffsetVal = CN->getSExtValue(); - Valid = isValidAddrOffset(Scale, OffsetVal); + Valid = Xtensa::isValidAddrOffset(Scale, OffsetVal); if (Valid) { // If the first operand is a FI, get the TargetFI Node. diff --git a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp index bced2d4ad0095..4a8bafc540df0 100644 --- a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp @@ -11,9 +11,9 @@ //===----------------------------------------------------------------------===// #include "XtensaRegisterInfo.h" +#include "MCTargetDesc/XtensaMCTargetDesc.h" #include "XtensaInstrInfo.h" #include "XtensaSubtarget.h" -#include "XtensaUtils.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -99,7 +99,7 @@ bool XtensaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int64_t Offset = SPOffset + (int64_t)StackSize + MI.getOperand(FIOperandNum + 1).getImm(); - bool Valid = isValidAddrOffset(MI, Offset); + bool Valid = Xtensa::isValidAddrOffsetForOpcode(MI.getOpcode(), Offset); // If MI is not a debug value, make sure Offset fits in the 16-bit immediate // field. diff --git a/llvm/lib/Target/Xtensa/XtensaUtils.cpp b/llvm/lib/Target/Xtensa/XtensaUtils.cpp deleted file mode 100644 index 98e424f6ea440..0000000000000 --- a/llvm/lib/Target/Xtensa/XtensaUtils.cpp +++ /dev/null @@ -1,59 +0,0 @@ -//===--- XtensaUtils.cpp ---- Xtensa Utility Functions ----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains miscellaneous utility functions. 
-// -//===----------------------------------------------------------------------===// - -#include "XtensaUtils.h" - -namespace llvm { - -bool isValidAddrOffset(int Scale, int64_t OffsetVal) { - bool Valid = false; - - switch (Scale) { - case 1: - Valid = (OffsetVal >= 0 && OffsetVal <= 255); - break; - case 2: - Valid = (OffsetVal >= 0 && OffsetVal <= 510) && ((OffsetVal & 0x1) == 0); - break; - case 4: - Valid = (OffsetVal >= 0 && OffsetVal <= 1020) && ((OffsetVal & 0x3) == 0); - break; - default: - break; - } - return Valid; -} - -bool isValidAddrOffset(MachineInstr &MI, int64_t Offset) { - int Scale = 0; - - switch (MI.getOpcode()) { - case Xtensa::L8UI: - case Xtensa::S8I: - Scale = 1; - break; - case Xtensa::L16SI: - case Xtensa::L16UI: - case Xtensa::S16I: - Scale = 2; - break; - case Xtensa::LEA_ADD: - return (Offset >= -128 && Offset <= 127); - default: - // assume that MI is 32-bit load/store operation - Scale = 4; - break; - } - return isValidAddrOffset(Scale, Offset); -} - -} // namespace llvm diff --git a/llvm/lib/Target/Xtensa/XtensaUtils.h b/llvm/lib/Target/Xtensa/XtensaUtils.h deleted file mode 100644 index 2b0ac37a6971a..0000000000000 --- a/llvm/lib/Target/Xtensa/XtensaUtils.h +++ /dev/null @@ -1,27 +0,0 @@ -//===--- XtensaUtils.h ---- Xtensa Utility Functions ------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains miscellaneous utility functions. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_XTENSA_XTENSAUTILS_H -#define LLVM_LIB_TARGET_XTENSA_XTENSAUTILS_H - -#include "XtensaInstrInfo.h" -#include "llvm/CodeGen/MachineInstr.h" - -namespace llvm { -// Check address offset for load/store instructions. -// The offset should be multiple of scale. -bool isValidAddrOffset(int Scale, int64_t OffsetVal); - -// Check address offset for load/store instructions. -bool isValidAddrOffset(MachineInstr &MI, int64_t Offset); -} // namespace llvm -#endif // LLVM_LIB_TARGET_XTENSA_XTENSAUTILS_H From 37fdde6025c8ead27a7608643b63e0d4498211e2 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 25 Jan 2025 22:53:10 -0800 Subject: [PATCH 130/432] [CodeGen] Remove implict conversions from Register to unsigned from MachineOperand. 
NFC --- llvm/include/llvm/CodeGen/MachineOperand.h | 2 +- llvm/lib/CodeGen/MachineOperand.cpp | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h index be1b4fb7d54fb..3ec46afa781ab 100644 --- a/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/llvm/include/llvm/CodeGen/MachineOperand.h @@ -854,7 +854,7 @@ class MachineOperand { Op.IsEarlyClobber = isEarlyClobber; Op.TiedTo = 0; Op.IsDebug = isDebug; - Op.SmallContents.RegNo = Reg; + Op.SmallContents.RegNo = Reg.id(); Op.Contents.Reg.Prev = nullptr; Op.Contents.Reg.Next = nullptr; Op.setSubReg(SubReg); diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index d11ac614ace35..f498491164e14 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -71,13 +71,13 @@ void MachineOperand::setReg(Register Reg) { if (MachineFunction *MF = getMFIfAvailable(*this)) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.removeRegOperandFromUseList(this); - SmallContents.RegNo = Reg; + SmallContents.RegNo = Reg.id(); MRI.addRegOperandToUseList(this); return; } // Otherwise, just change the register, no problem. :) - SmallContents.RegNo = Reg; + SmallContents.RegNo = Reg.id(); } void MachineOperand::substVirtReg(Register Reg, unsigned SubIdx, @@ -291,7 +291,7 @@ void MachineOperand::ChangeToRegister(Register Reg, bool isDef, bool isImp, assert(!(isDead && !isDef) && "Dead flag on non-def"); assert(!(isKill && isDef) && "Kill flag on def"); OpKind = MO_Register; - SmallContents.RegNo = Reg; + SmallContents.RegNo = Reg.id(); SubReg_TargetFlags = 0; IsDef = isDef; IsImp = isImp; @@ -390,7 +390,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) { switch (MO.getType()) { case MachineOperand::MO_Register: // Register operands don't have target flags. - return hash_combine(MO.getType(), (unsigned)MO.getReg(), MO.getSubReg(), MO.isDef()); + return hash_combine(MO.getType(), MO.getReg().id(), MO.getSubReg(), + MO.isDef()); case MachineOperand::MO_Immediate: return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm()); case MachineOperand::MO_CImmediate: From f46eb1430992ba1abe246dfd0b4ccf8229fe0ab7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 26 Jan 2025 00:15:32 -0800 Subject: [PATCH 131/432] [AMDGPU] Replace unsigned with Register in SIMachineScheduler. NFC Some of these may eventually need to VirtRegOrUnit. --- llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 51 +++++++++---------- llvm/lib/Target/AMDGPU/SIMachineScheduler.h | 26 +++++----- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 77b4f25021c75..b3fa65512e4c4 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -284,10 +284,9 @@ void SIScheduleBlock::fastSchedule() { } // Returns if the register was set between first and last. 
-static bool isDefBetween(unsigned Reg, - SlotIndex First, SlotIndex Last, - const MachineRegisterInfo *MRI, - const LiveIntervals *LIS) { +static bool isDefBetween(Register Reg, SlotIndex First, SlotIndex Last, + const MachineRegisterInfo *MRI, + const LiveIntervals *LIS) { for (MachineRegisterInfo::def_instr_iterator UI = MRI->def_instr_begin(Reg), UE = MRI->def_instr_end(); UI != UE; ++UI) { @@ -581,11 +580,11 @@ void SIScheduleBlock::printDebug(bool full) { << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' ' << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n"; dbgs() << "LiveIns:\n"; - for (unsigned Reg : LiveInRegs) + for (Register Reg : LiveInRegs) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << "\nLiveOuts:\n"; - for (unsigned Reg : LiveOutRegs) + for (Register Reg : LiveOutRegs) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; } @@ -1413,12 +1412,12 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, // highest topological index. LiveOutRegsNumUsages.resize(Blocks.size()); for (SIScheduleBlock *Block : Blocks) { - for (unsigned Reg : Block->getInRegs()) { + for (Register Reg : Block->getInRegs()) { bool Found = false; int topoInd = -1; for (SIScheduleBlock* Pred: Block->getPreds()) { - std::set PredOutRegs = Pred->getOutRegs(); - std::set::iterator RegPos = PredOutRegs.find(Reg); + std::set PredOutRegs = Pred->getOutRegs(); + std::set::iterator RegPos = PredOutRegs.find(Reg); if (RegPos != PredOutRegs.end()) { Found = true; @@ -1453,18 +1452,18 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, } #endif - std::set InRegs = DAG->getInRegs(); + std::set InRegs = DAG->getInRegs(); addLiveRegs(InRegs); // Increase LiveOutRegsNumUsages for blocks // producing registers consumed in another // scheduling region. - for (unsigned Reg : DAG->getOutRegs()) { + for (Register Reg : DAG->getOutRegs()) { for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { // Do reverse traversal int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; SIScheduleBlock *Block = Blocks[ID]; - const std::set &OutRegs = Block->getOutRegs(); + const std::set &OutRegs = Block->getOutRegs(); if (OutRegs.find(Reg) == OutRegs.end()) continue; @@ -1477,11 +1476,11 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, // Fill LiveRegsConsumers for regs that were already // defined before scheduling. 
for (SIScheduleBlock *Block : Blocks) { - for (unsigned Reg : Block->getInRegs()) { + for (Register Reg : Block->getInRegs()) { bool Found = false; for (SIScheduleBlock* Pred: Block->getPreds()) { - std::set PredOutRegs = Pred->getOutRegs(); - std::set::iterator RegPos = PredOutRegs.find(Reg); + std::set PredOutRegs = Pred->getOutRegs(); + std::set::iterator RegPos = PredOutRegs.find(Reg); if (RegPos != PredOutRegs.end()) { Found = true; @@ -1573,13 +1572,11 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { if (SregCurrentUsage > maxSregUsage) maxSregUsage = SregCurrentUsage; LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: "; - for (SIScheduleBlock *Block - : ReadyBlocks) dbgs() - << Block->getID() << ' '; + for (SIScheduleBlock *Block : ReadyBlocks) + dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; - for (unsigned Reg - : LiveRegs) dbgs() - << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; + for (Register Reg : LiveRegs) + dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';); @@ -1634,7 +1631,7 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { // Tracking of currently alive registers to determine VGPR Usage. -void SIScheduleBlockScheduler::addLiveRegs(std::set &Regs) { +void SIScheduleBlockScheduler::addLiveRegs(std::set &Regs) { for (Register Reg : Regs) { // For now only track virtual registers. if (!Reg.isVirtual()) @@ -1645,10 +1642,10 @@ void SIScheduleBlockScheduler::addLiveRegs(std::set &Regs) { } void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, - std::set &Regs) { - for (unsigned Reg : Regs) { + std::set &Regs) { + for (Register Reg : Regs) { // For now only track virtual registers. - std::set::iterator Pos = LiveRegs.find(Reg); + std::set::iterator Pos = LiveRegs.find(Reg); assert (Pos != LiveRegs.end() && // Reg must be live. LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1); @@ -1687,8 +1684,8 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { } std::vector -SIScheduleBlockScheduler::checkRegUsageImpact(std::set &InRegs, - std::set &OutRegs) { +SIScheduleBlockScheduler::checkRegUsageImpact(std::set &InRegs, + std::set &OutRegs) { std::vector DiffSetPressure; DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0); diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h index f8f4b5aae338e..b219cbd5672f0 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -81,8 +81,8 @@ class SIScheduleBlock { // Note that some registers are not 32 bits, // and thus the pressure is not equal // to the number of live registers. 
- std::set LiveInRegs; - std::set LiveOutRegs; + std::set LiveInRegs; + std::set LiveOutRegs; bool Scheduled = false; bool HighLatencyBlock = false; @@ -157,8 +157,8 @@ class SIScheduleBlock { return InternalAdditionalPressure; } - std::set &getInRegs() { return LiveInRegs; } - std::set &getOutRegs() { return LiveOutRegs; } + std::set &getInRegs() { return LiveInRegs; } + std::set &getOutRegs() { return LiveOutRegs; } void printDebug(bool Full); @@ -320,10 +320,10 @@ class SIScheduleBlockScheduler { SISchedulerBlockSchedulerVariant Variant; std::vector Blocks; - std::vector> LiveOutRegsNumUsages; - std::set LiveRegs; + std::vector> LiveOutRegsNumUsages; + std::set LiveRegs; // Num of schedulable unscheduled blocks reading the register. - std::map LiveRegsConsumers; + std::map LiveRegsConsumers; std::vector LastPosHighLatencyParentScheduled; int LastPosWaitedHighLatency; @@ -389,15 +389,15 @@ class SIScheduleBlockScheduler { SIBlockSchedCandidate &TryCand); SIScheduleBlock *pickBlock(); - void addLiveRegs(std::set &Regs); - void decreaseLiveRegs(SIScheduleBlock *Block, std::set &Regs); + void addLiveRegs(std::set &Regs); + void decreaseLiveRegs(SIScheduleBlock *Block, std::set &Regs); void releaseBlockSuccs(SIScheduleBlock *Parent); void blockScheduled(SIScheduleBlock *Block); // Check register pressure change // by scheduling a block with these LiveIn and LiveOut. - std::vector checkRegUsageImpact(std::set &InRegs, - std::set &OutRegs); + std::vector checkRegUsageImpact(std::set &InRegs, + std::set &OutRegs); void schedule(); }; @@ -462,8 +462,8 @@ class SIScheduleDAGMI final : public ScheduleDAGMILive { unsigned &VgprUsage, unsigned &SgprUsage); - std::set getInRegs() { - std::set InRegs; + std::set getInRegs() { + std::set InRegs; for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { InRegs.insert(RegMaskPair.RegUnit); } From ab895ad2bfb6835e8c47d8e616edb6cadaf59b77 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 26 Jan 2025 01:34:41 -0800 Subject: [PATCH 132/432] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#124446) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect Pattern to be nonnull. 
---
 clang/lib/AST/DeclTemplate.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index e4cb7dcb16a45..de81bc64106f1 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -1463,7 +1463,7 @@ SourceRange VarTemplateSpecializationDecl::getSourceRange() const {
     assert(!Pattern.isNull() &&
            "Variable template specialization without pattern?");
     if (const auto *VTPSD =
-            Pattern.dyn_cast<VarTemplatePartialSpecializationDecl *>())
+            dyn_cast<VarTemplatePartialSpecializationDecl *>(Pattern))
       return VTPSD->getSourceRange();
     VarTemplateDecl *VTD = cast<VarTemplateDecl *>(Pattern);
     if (hasInit()) {

From f09a6f632584c2b34f8f2d048a5420b040bb1005 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sun, 26 Jan 2025 01:34:59 -0800
Subject: [PATCH 133/432] [Sema] Migrate away from PointerUnion::dyn_cast (NFC)
 (#124447)

Note that PointerUnion::dyn_cast has been soft deprecated in
PointerUnion.h:

  // FIXME: Replace the uses of is(), get() and dyn_cast() with
  //        isa<T>, cast<T> and the llvm::dyn_cast<T>

Literal migration would result in dyn_cast_if_present (see the
definition of PointerUnion::dyn_cast), but this patch uses dyn_cast
because we expect AnyFunc to be nonnull.
---
 clang/lib/Sema/SemaAPINotes.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index 4f79775bc5e91..b354bb7b06435 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -478,7 +478,7 @@ static void ProcessAPINotes(Sema &S, FunctionOrMethod AnyFunc,
                             const api_notes::FunctionInfo &Info,
                             VersionedInfoMetadata Metadata) {
   // Find the declaration itself.
-  FunctionDecl *FD = AnyFunc.dyn_cast<FunctionDecl *>();
+  FunctionDecl *FD = dyn_cast<FunctionDecl *>(AnyFunc);
   Decl *D = FD;
   ObjCMethodDecl *MD = nullptr;
   if (!D) {

From 850852e9a45f7883bd1a04c2a6b9fceb6dcdaba2 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sun, 26 Jan 2025 01:35:39 -0800
Subject: [PATCH 134/432] [CodeGen] Avoid repeated hash lookups (NFC) (#124455)

---
 llvm/lib/CodeGen/InlineSpiller.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index d98254650a001..33915d0f7f829 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -1578,7 +1578,8 @@ void HoistSpillHelper::runHoistSpills(
     for (auto *const SpillBB : SpillsInSubTree) {
       // When SpillBB is a BB contains original spill, insert the spill
       // to SpillsToRm.
-      if (SpillsToKeep.contains(SpillBB) && !SpillsToKeep[SpillBB]) {
+      if (auto It = SpillsToKeep.find(SpillBB);
+          It != SpillsToKeep.end() && !It->second) {
         MachineInstr *SpillToRm = SpillBBToSpill[SpillBB];
         SpillsToRm.push_back(SpillToRm);
       }

From 8035d38daab028b8da3cf2b01090b5f0ceacd695 Mon Sep 17 00:00:00 2001
From: Mats Petersson
Date: Sun, 26 Jan 2025 09:44:04 +0000
Subject: [PATCH 135/432] [Flang][OpenMP]Add parsing support for DISPATCH
 construct (#121982)

This allows the Flang parser to accept the !$OMP DISPATCH and related
clauses. Lowering is currently not implemented.
Tests for unparse and parse-tree dump are provided, and one for checking
that the lowering ends in a "not yet implemented" message.

---------

Co-authored-by: Kiran Chandramohan
---
 flang/include/flang/Parser/dump-parse-tree.h |  3 ++
 flang/include/flang/Parser/parse-tree.h      | 31 ++++++++++--
 flang/lib/Lower/OpenMP/OpenMP.cpp            | 10 ++++
 flang/lib/Parser/openmp-parsers.cpp          | 15 ++++++
 flang/lib/Parser/unparse.cpp                 |  9 ++++
 flang/lib/Semantics/check-omp-structure.cpp  | 30 ++++++++++++
 flang/lib/Semantics/check-omp-structure.h   |  2 +
 flang/lib/Semantics/resolve-directives.cpp   |  8 +++
 flang/test/Lower/OpenMP/Todo/dispatch.f90    | 12 +++++
 flang/test/Parser/OpenMP/dispatch.f90        | 51 ++++++++++++++++++++
 flang/test/Semantics/OpenMP/dispatch.f90     | 24 +++++++++
 11 files changed, 192 insertions(+), 3 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/Todo/dispatch.f90
 create mode 100644 flang/test/Parser/OpenMP/dispatch.f90
 create mode 100644 flang/test/Semantics/OpenMP/dispatch.f90

diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 11725991e9c9a..a501ae658a382 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -679,6 +679,9 @@ class ParseTreeDumper {
   NODE_ENUM(common, OmpAtomicDefaultMemOrderType)
   NODE(parser, OpenMPDepobjConstruct)
   NODE(parser, OpenMPUtilityConstruct)
+  NODE(parser, OpenMPDispatchConstruct)
+  NODE(parser, OmpDispatchDirective)
+  NODE(parser, OmpEndDispatchDirective)
   NODE(parser, OpenMPFlushConstruct)
   NODE(parser, OpenMPLoopConstruct)
   NODE(parser, OpenMPExecutableAllocate)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 00d85aa05fb3a..78962db8a84de 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4685,6 +4685,31 @@ struct OpenMPDepobjConstruct {
   std::tuple<Verbatim, OmpObject, OmpClause> t;
 };
 
+// Ref: [5.2: 200-201]
+//
+// dispatch-construct -> DISPATCH dispatch-clause
+// dispatch-clause -> depend-clause |
+//                    device-clause |
+//                    is_device_ptr-clause |
+//                    nocontext-clause |
+//                    novariants-clause |
+//                    nowait-clause
+struct OmpDispatchDirective {
+  TUPLE_CLASS_BOILERPLATE(OmpDispatchDirective);
+  CharBlock source;
+  std::tuple<Verbatim, OmpClauseList> t;
+};
+
+EMPTY_CLASS(OmpEndDispatchDirective);
+
+struct OpenMPDispatchConstruct {
+  TUPLE_CLASS_BOILERPLATE(OpenMPDispatchConstruct);
+  CharBlock source;
+  std::tuple<OmpDispatchDirective, Block,
+      std::optional<OmpEndDispatchDirective>>
+      t;
+};
+
 // 2.17.8 flush -> FLUSH [memory-order-clause] [(variable-name-list)]
 struct OpenMPFlushConstruct {
   TUPLE_CLASS_BOILERPLATE(OpenMPFlushConstruct);
@@ -4757,9 +4782,9 @@ struct OpenMPConstruct {
   UNION_CLASS_BOILERPLATE(OpenMPConstruct);
   std::variant<OpenMPStandaloneConstruct, OpenMPSectionsConstruct,
       OpenMPSectionConstruct, OpenMPLoopConstruct, OpenMPBlockConstruct,
-      OpenMPAtomicConstruct, OpenMPDeclarativeAllocate,
-      OpenMPUtilityConstruct, OpenMPExecutableAllocate,
-      OpenMPAllocatorsConstruct, OpenMPCriticalConstruct>
+      OpenMPAtomicConstruct, OpenMPDeclarativeAllocate, OpenMPDispatchConstruct,
+      OpenMPUtilityConstruct, OpenMPExecutableAllocate,
+      OpenMPAllocatorsConstruct, OpenMPCriticalConstruct>
       u;
 };
 
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 1434bcd6330e0..7c8d292e90f01 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -381,6 +381,9 @@ extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) {
       [](const parser::OpenMPDeclarativeAllocate &c) {
         return llvm::omp::OMPD_allocate;
       },
+      [](const parser::OpenMPDispatchConstruct &c) {
+        return llvm::omp::OMPD_dispatch;
+      },
       [](const parser::OpenMPExecutableAllocate &c) {
         return llvm::omp::OMPD_allocate;
       },
@@ -3388,6 +3391,13 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
   TODO(converter.getCurrentLocation(), "OpenMPUtilityConstruct");
 }
 
+static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
+                   semantics::SemanticsContext &semaCtx,
+                   lower::pft::Evaluation &eval,
+                   const parser::OpenMPDispatchConstruct &) {
+  TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct");
+}
+
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 5ff91da082c85..aa2fec01bc640 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -740,11 +740,15 @@ TYPE_PARSER(
     "MERGEABLE" >> construct<OmpClause>(construct<OmpClause::Mergeable>()) ||
     "MESSAGE" >> construct<OmpClause>(construct<OmpClause::Message>(
                      parenthesized(Parser<OmpMessageClause>{}))) ||
+    "NOCONTEXT" >> construct<OmpClause>(construct<OmpClause::Nocontext>(
+                       parenthesized(scalarLogicalExpr))) ||
     "NOGROUP" >> construct<OmpClause>(construct<OmpClause::Nogroup>()) ||
     "NONTEMPORAL" >> construct<OmpClause>(construct<OmpClause::Nontemporal>(
                          parenthesized(nonemptyList(name)))) ||
     "NOTINBRANCH" >> construct<OmpClause>(construct<OmpClause::Notinbranch>()) ||
+    "NOVARIANTS" >> construct<OmpClause>(construct<OmpClause::Novariants>(
+                        parenthesized(scalarLogicalExpr))) ||
     "NOWAIT" >> construct<OmpClause>(construct<OmpClause::Nowait>()) ||
     "NUM_TASKS" >> construct<OmpClause>(construct<OmpClause::NumTasks>(
                        parenthesized(Parser<OmpNumTasksClause>{}))) ||
@@ -1119,6 +1123,16 @@ TYPE_PARSER(sourced(construct<OmpCriticalDirective>(verbatim("CRITICAL"_tok),
 TYPE_PARSER(construct<OpenMPCriticalConstruct>(
     Parser<OmpCriticalDirective>{}, block, Parser<OmpEndCriticalDirective>{}))
 
+TYPE_PARSER(sourced(construct<OmpDispatchDirective>(
+    verbatim("DISPATCH"_tok), Parser<OmpClauseList>{})))
+
+TYPE_PARSER(
+    construct<OmpEndDispatchDirective>(startOmpLine >> "END DISPATCH"_tok))
+
+TYPE_PARSER(sourced(construct<OpenMPDispatchConstruct>(
+    Parser<OmpDispatchDirective>{} / endOmpLine, block,
+    maybe(Parser<OmpEndDispatchDirective>{} / endOmpLine))))
+
 // 2.11.3 Executable Allocate directive
 TYPE_PARSER(
     sourced(construct<OpenMPExecutableAllocate>(verbatim("ALLOCATE"_tok),
@@ -1219,6 +1233,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US,
         construct<OpenMPConstruct>(Parser<OpenMPStandaloneConstruct>{}),
         construct<OpenMPConstruct>(Parser<OpenMPAtomicConstruct>{}),
         construct<OpenMPConstruct>(Parser<OpenMPUtilityConstruct>{}),
+        construct<OpenMPConstruct>(Parser<OpenMPDispatchConstruct>{}),
         construct<OpenMPConstruct>(Parser<OpenMPExecutableAllocate>{}),
         construct<OpenMPConstruct>(Parser<OpenMPAllocatorsConstruct>{}),
         construct<OpenMPConstruct>(Parser<OpenMPDeclarativeAllocate>{}),
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 7bf404bba2c3e..5b1ff07382c4d 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2725,6 +2725,15 @@ class UnparseVisitor {
     Walk(x.v);
     return false;
   }
+  void Unparse(const OmpDispatchDirective &x) {
+    Word("!$OMP DISPATCH");
+    Walk(x.t);
+    Put("\n");
+  }
+  void Unparse(const OmpEndDispatchDirective &) {
+    Word("!$OMP END DISPATCH");
+    Put("\n");
+  }
   void Unparse(const OmpErrorDirective &x) {
     Word("!$OMP ERROR ");
     Walk(x.t);
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index d3f2d3fd2f9dc..c7ad9cc085a21 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1748,6 +1748,36 @@ void OmpStructureChecker::Enter(const parser::OmpErrorDirective &x) {
   PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_error);
 }
 
+void OmpStructureChecker::Enter(const parser::OpenMPDispatchConstruct &x) {
+  PushContextAndClauseSets(x.source, llvm::omp::Directive::OMPD_dispatch);
+  const auto &block{std::get<parser::Block>(x.t)};
+  if (block.empty() || block.size() > 1) {
+    context_.Say(x.source,
+        "The DISPATCH construct is empty or contains more than one statement"_err_en_US);
+    return;
+  }
+
+  auto it{block.begin()};
+  bool passChecks{false};
+  if (const parser::AssignmentStmt *
+          assignStmt{parser::Unwrap<parser::AssignmentStmt>(*it)}) {
+    if (parser::Unwrap<parser::FunctionReference>(assignStmt->t)) {
+      passChecks = true;
+    }
+  } else if (parser::Unwrap<parser::CallStmt>(*it)) {
+    passChecks = true;
+  }
+
+  if (!passChecks) {
+    context_.Say(x.source,
+        "The DISPATCH construct does not contain a SUBROUTINE or FUNCTION"_err_en_US);
+  }
+}
+
+void OmpStructureChecker::Leave(const parser::OpenMPDispatchConstruct &x) {
+  dirContext_.pop_back();
+}
+
 void OmpStructureChecker::Leave(const parser::OmpErrorDirective &x) {
   dirContext_.pop_back();
 }
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index dc360957c873b..2b8304cb17037 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -105,6 +105,8 @@ class OmpStructureChecker
   void Enter(const parser::OmpDeclareTargetWithList &);
   void Enter(const parser::OmpDeclareTargetWithClause &);
   void Leave(const parser::OmpDeclareTargetWithClause &);
+  void Enter(const parser::OpenMPDispatchConstruct &);
+  void Leave(const parser::OpenMPDispatchConstruct &);
   void Enter(const parser::OmpErrorDirective &);
   void Leave(const parser::OmpErrorDirective &);
   void Enter(const parser::OpenMPExecutableAllocate &);
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index ea102371334a6..4e6d819f545a2 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -441,6 +441,9 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
   bool Pre(const parser::OpenMPDeclarativeAllocate &);
   void Post(const parser::OpenMPDeclarativeAllocate &) { PopContext(); }
 
+  bool Pre(const parser::OpenMPDispatchConstruct &);
+  void Post(const parser::OpenMPDispatchConstruct &) { PopContext(); }
+
   bool Pre(const parser::OpenMPExecutableAllocate &);
   void Post(const parser::OpenMPExecutableAllocate &);
 
@@ -1976,6 +1979,11 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDeclarativeAllocate &x) {
   return false;
 }
 
+bool OmpAttributeVisitor::Pre(const parser::OpenMPDispatchConstruct &x) {
+  PushContext(x.source, llvm::omp::Directive::OMPD_dispatch);
+  return true;
+}
+
 bool OmpAttributeVisitor::Pre(const parser::OpenMPExecutableAllocate &x) {
   PushContext(x.source, llvm::omp::Directive::OMPD_allocate);
   const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)};
diff --git a/flang/test/Lower/OpenMP/Todo/dispatch.f90 b/flang/test/Lower/OpenMP/Todo/dispatch.f90
new file mode 100644
index 0000000000000..380dfa14eaae1
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/dispatch.f90
@@ -0,0 +1,12 @@
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: OpenMPDispatchConstruct
+program p
+  integer r
+  r = 1
+!$omp dispatch nowait
+  call foo()
+contains
+  subroutine foo
+  end subroutine
+end program p
diff --git a/flang/test/Parser/OpenMP/dispatch.f90 b/flang/test/Parser/OpenMP/dispatch.f90
new file mode 100644
index 0000000000000..98cd6090334f3
--- /dev/null
+++ b/flang/test/Parser/OpenMP/dispatch.f90
@@ -0,0 +1,51 @@
+! RUN: %flang_fc1 -fopenmp -fdebug-dump-parse-tree %s | FileCheck %s
+! 
RUN: %flang_fc1 -fopenmp -fdebug-unparse %s | FileCheck %s --check-prefix="UNPARSE" + +integer function func(a, b, c) + integer :: a, b, c + func = a + b + c +end function func + +subroutine sub(x) + use iso_c_binding + integer :: func + integer :: r + type(c_ptr) :: x + integer :: a = 14, b = 7, c = 21 +!UNPARSE: !$OMP DISPATCH DEVICE(3_4) NOWAIT NOCONTEXT(.false._4) NOVARIANTS(.true._4) +!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPDispatchConstruct +!CHECK-NEXT: | | | OmpDispatchDirective +!CHECK: | | | | OmpClauseList -> OmpClause -> Device -> OmpDeviceClause +!CHECK-NEXT: | | | | | Scalar -> Integer -> Expr = '3_4' +!CHECK-NEXT: | | | | | | LiteralConstant -> IntLiteralConstant = '3' +!CHECK-NEXT: | | | | OmpClause -> Nowait +!CHECK-NEXT: | | | | OmpClause -> Nocontext -> Scalar -> Logical -> Expr = '.false._4' +!CHECK-NEXT: | | | | | LiteralConstant -> LogicalLiteralConstant +!CHECK-NEXT: | | | | | | bool = 'false' +!CHECK-NEXT: | | | | OmpClause -> Novariants -> Scalar -> Logical -> Expr = '.true._4' +!CHECK-NEXT: | | | | | EQ +!CHECK-NEXT: | | | | | | Expr = '1_4' +!CHECK-NEXT: | | | | | | | LiteralConstant -> IntLiteralConstant = '1' +!CHECK-NEXT: | | | | | | Expr = '1_4' +!CHECK-NEXT: | | | | | | | LiteralConstant -> IntLiteralConstant = '1' +!CHECK-NEXT: | | | Block + + !$omp dispatch device(3) nowait nocontext(.false.) novariants(1.eq.1) + r = func(a, b, c) +!UNPARSE: !$OMP END DISPATCH +!CHECK: | | | OmpEndDispatchDirective + !$omp end dispatch + +!! Test the "no end dispatch" option. +!UNPARSE: !$OMP DISPATCH DEVICE(3_4) IS_DEVICE_PTR(x) +!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPDispatchConstruct +!CHECK-NEXT: | | | OmpDispatchDirective +!CHECK: | | | | OmpClause -> IsDevicePtr -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' + !$omp dispatch device(3) is_device_ptr(x) + r = func(a+1, b+2, c+3) +!CHECK-NOT: | | | OmpEndDispatchDirective + +end subroutine sub + + + diff --git a/flang/test/Semantics/OpenMP/dispatch.f90 b/flang/test/Semantics/OpenMP/dispatch.f90 new file mode 100644 index 0000000000000..7dfbeecb2fc1d --- /dev/null +++ b/flang/test/Semantics/OpenMP/dispatch.f90 @@ -0,0 +1,24 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenmp + +subroutine sb1 + integer :: r + r = 1 + !ERROR: The DISPATCH construct does not contain a SUBROUTINE or FUNCTION + !$omp dispatch nowait + print *,r +end subroutine +subroutine sb2 + integer :: r +!ERROR: The DISPATCH construct is empty or contains more than one statement + !$omp dispatch + call foo() + r = bar() + !$omp end dispatch +contains + subroutine foo + end subroutine foo + function bar + integer :: bar + bar = 2 + end function +end subroutine From 81d38da65e336dfb023df89f1bdc32633ad05fb2 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 26 Jan 2025 13:46:00 +0000 Subject: [PATCH 136/432] [LV] Add more tests for narrowing interleave groups for AArch64. Add additional tests for https://github.com/llvm/llvm-project/pull/106441. 
---
 ...sform-narrow-interleave-to-widen-memory.ll | 205 ++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
new file mode 100644
index 0000000000000..3fca274a3bb12
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -0,0 +1,205 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -p loop-vectorize -S %s | FileCheck --check-prefixes=CHECK %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx15.0.0"
+
+define void @test_complex_add_float(ptr %res, ptr noalias %A, ptr noalias %B, i64 %N) {
+; CHECK-LABEL: define void @test_complex_add_float(
+; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[GEP_A_0]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[WIDE_VEC5:%.*]] = load <8 x float>, ptr [[GEP_B_0]], align 4
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[STRIDED_VEC]], [[STRIDED_VEC6]]
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
+; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[IV1]]
+; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[IV1]]
+; CHECK-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_2]], align 4
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_A_2]], i64 4
+; CHECK-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[L_A_0]], [[L_B_0]]
+; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_B_2]], i64 4
+; CHECK-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd float [[L_A_1]], [[L_B_1]]
+; CHECK-NEXT: [[GEP_RES_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[IV1]]
+; CHECK-NEXT: store float [[ADD_0]], ptr [[GEP_RES_0]], align 4
+; CHECK-NEXT: [[GEP_RES_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_RES_0]], i64 4
+; CHECK-NEXT: store float [[ADD_1]], ptr [[GEP_RES_1]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.0 = getelementptr inbounds nuw { float, float }, ptr %A, i64 %iv
+  %gep.B.0 = getelementptr inbounds nuw { float, float }, ptr %B, i64 %iv
+  %l.A.0 = load float, ptr %gep.A.0, align 4
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 4
+  %l.A.1 = load float, ptr %gep.A.1, align 4
+  %l.B.0 = load float, ptr %gep.B.0, align 4
+  %add.0 = fadd float %l.A.0, %l.B.0
+  %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B.0, i64 4
+  %l.B.1 = load float, ptr %gep.B.1, align 4
+  %add.1 = fadd float %l.A.1, %l.B.1
+  %gep.res.0 = getelementptr inbounds nuw { float, float }, ptr %res, i64 %iv
+  store float %add.0, ptr %gep.res.0, align 4
+  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 4
+  store float %add.1, ptr %gep.res.1, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i64 %N) {
+; CHECK-LABEL: define void @test_complex_add_double(
+; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC5:%.*]] = load <4 x double>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <4 x double>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[STRIDED_VEC6]]
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
+; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_A_0]], i64 8
+; CHECK-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
+; CHECK-NEXT: [[ADD_0:%.*]] = fadd double [[L_A_0]], [[L_B_0]]
+; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_B_0]], i64 8
+; CHECK-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd double [[L_A_1]], [[L_B_1]]
+; CHECK-NEXT: [[GEP_RES_0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[IV]]
+; CHECK-NEXT: store double [[ADD_0]], ptr [[GEP_RES_0]], align 4
+; CHECK-NEXT: [[GEP_RES_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_RES_0]], i64 8
+; CHECK-NEXT: store double [[ADD_1]], ptr [[GEP_RES_1]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.0 = getelementptr inbounds nuw { double, double }, ptr %A, i64 %iv
+  %gep.B.0 = getelementptr inbounds nuw { double, double }, ptr %B, i64 %iv
+  %l.A.0 = load double, ptr %gep.A.0, align 4
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 8
+  %l.A.1 = load double, ptr %gep.A.1, align 4
+  %l.B.0 = load double, ptr %gep.B.0, align 4
+  %add.0 = fadd double %l.A.0, %l.B.0
+  %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B.0, i64 8
+  %l.B.1 = load double, ptr %gep.B.1, align 4
+  %add.1 = fadd double %l.A.1, %l.B.1
+  %gep.res.0 = getelementptr inbounds nuw { double, double }, ptr %res, i64 %iv
+  store double %add.0, ptr %gep.res.0, align 4
+  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 8
+  store double %add.1, ptr %gep.res.1, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}

From dec47b76f406242dfb9d36da4d7adfb171c71104 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 26 Jan 2025 14:43:51 +0000
Subject: 
[PATCH 137/432] [CostModel][X86] Update baseline CTTZ/CTLZ costs for x86_64 (#124312) Followup to #123623 - now that the CMOV has been removed, the throughput has improved, reducing the benefit of vectorization on pre-x86-64-v3 CPUs --- .../lib/Target/X86/X86TargetTransformInfo.cpp | 10 +- .../Analysis/CostModel/X86/ctlz-codesize.ll | 8 +- .../CostModel/X86/ctlz-sizelatency.ll | 8 +- llvm/test/Analysis/CostModel/X86/ctlz.ll | 4 +- .../Analysis/CostModel/X86/cttz-codesize.ll | 4 +- .../CostModel/X86/cttz-sizelatency.ll | 8 +- llvm/test/Analysis/CostModel/X86/cttz.ll | 4 +- .../CostModel/X86/intrinsic-cost-kinds.ll | 6 +- .../test/Transforms/SLPVectorizer/X86/ctlz.ll | 172 +++++++++++------- .../test/Transforms/SLPVectorizer/X86/cttz.ll | 74 +++++++- 10 files changed, 200 insertions(+), 98 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index d3c923a76d074..cdc2ce752743c 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4329,9 +4329,15 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } }, { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } }, - { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR + { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR + { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR + { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR - { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH + { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF + { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF + { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF + { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } }, { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } }, diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll index da0f71c63ef80..9f8e4edf7a0fc 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' @@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) { define i16 @var_ctlz_i16(i16 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i16' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call 
i16 @llvm.ctlz.i16(i16 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i16' @@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) { define i8 @var_ctlz_i8(i8 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i8' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll index 2425e7286265b..fc3516695852a 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' @@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) { define i16 @var_ctlz_i16(i16 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i16' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i16' @@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) { define i8 @var_ctlz_i8(i8 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i8' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll index fa7982ce09e9c..d9d04de12467d 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call 
i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' diff --git a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll index 07bf1dd7a2ff6..621c1b9320fc8 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' @@ -40,7 +40,7 @@ define i64 @var_cttz_i64u(i64 %a) { define i32 @var_cttz_i32(i32 %a) { ; NOBMI-LABEL: 'var_cttz_i32' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz ; ; BMI-LABEL: 'var_cttz_i32' diff --git a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll index afe5cb8c55fe6..34d363ce00879 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' @@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) { define i32 @var_cttz_i32(i32 %a) { ; NOBMI-LABEL: 'var_cttz_i32' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz ; ; BMI-LABEL: 'var_cttz_i32' @@ -70,7 +70,7 @@ define i32 @var_cttz_i32u(i32 %a) { define i16 @var_cttz_i16(i16 %a) { ; NOBMI-LABEL: 'var_cttz_i16' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
ret i16 %cttz ; ; BMI-LABEL: 'var_cttz_i16' @@ -96,7 +96,7 @@ define i16 @var_cttz_i16u(i16 %a) { define i8 @var_cttz_i8(i8 %a) { ; NOBMI-LABEL: 'var_cttz_i8' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %cttz ; ; BMI-LABEL: 'var_cttz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/cttz.ll b/llvm/test/Analysis/CostModel/X86/cttz.ll index fa0f10f886f63..3f5a731b27d9b 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' @@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) { define i32 @var_cttz_i32(i32 %a) { ; NOBMI-LABEL: 'var_cttz_i32' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %cttz ; ; BMI-LABEL: 'var_cttz_i32' diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll index 062e5f157bae2..bcef47ee9e056 100644 --- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll +++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll @@ -232,7 +232,7 @@ define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) { define void @cttz(i32 %a, <16 x i32> %va) { ; THRU-LABEL: 'cttz' -; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; THRU-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -242,12 +242,12 @@ define void @cttz(i32 %a, <16 x i32> %va) { ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'cttz' -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'cttz' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; SIZE_LATE-NEXT: Cost Model: 
Found an estimated cost of 104 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll index 8a22e45fe1ca5..9bf2ade3176d6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=icelake-server -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 @@ -136,32 +136,47 @@ define void @ctlz_4i64() #0 { } define void @ctlz_4i32() #0 { -; SSE2-LABEL: @ctlz_4i32( -; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 -; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 -; SSE2-NEXT: ret void +; SSE-LABEL: @ctlz_4i32( +; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 +; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 +; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 +; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; SSE-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4 +; SSE-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 +; SSE-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: ret void +; +; AVX1-LABEL: @ctlz_4i32( +; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 +; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 +; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 +; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; AVX1-NEXT: 
[[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; AVX1-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4 +; AVX1-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 +; AVX1-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 +; AVX1-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 +; AVX1-NEXT: ret void ; -; SSE4-LABEL: @ctlz_4i32( -; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 -; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 -; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 -; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 -; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) -; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) -; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) -; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4 -; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 -; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 -; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 -; SSE4-NEXT: ret void +; AVX2-LABEL: @ctlz_4i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX2-NEXT: ret void ; -; AVX-LABEL: @ctlz_4i32( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 -; AVX-NEXT: ret void +; AVX512-LABEL: @ctlz_4i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX512-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX512-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 4 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 @@ -179,47 +194,71 @@ define void @ctlz_4i32() #0 { } define void @ctlz_8i32() #0 { -; SSE2-LABEL: @ctlz_8i32( -; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2 -; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 2 -; SSE2-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 -; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false) -; SSE2-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 -; SSE2-NEXT: ret void +; SSE-LABEL: @ctlz_8i32( +; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 +; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 +; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 +; SSE-NEXT: [[LD3:%.*]] = load i32, 
ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 +; SSE-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; SSE-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 +; SSE-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 +; SSE-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 +; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; SSE-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) +; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) +; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) +; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) +; SSE-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2 +; SSE-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 +; SSE-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 +; SSE-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 +; SSE-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; SSE-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 +; SSE-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 +; SSE-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: ret void +; +; AVX1-LABEL: @ctlz_8i32( +; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 +; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 +; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 +; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 +; AVX1-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; AVX1-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 +; AVX1-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 +; AVX1-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; AVX1-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) +; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) +; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) +; AVX1-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) +; AVX1-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2 +; AVX1-NEXT: store i32 [[CTLZ1]], ptr getelementptr 
inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 +; AVX1-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 +; AVX1-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 +; AVX1-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; AVX1-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 +; AVX1-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 +; AVX1-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: ret void ; -; SSE4-LABEL: @ctlz_8i32( -; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 -; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 -; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 -; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 -; SSE4-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 -; SSE4-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 -; SSE4-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 -; SSE4-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 -; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) -; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) -; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) -; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; SSE4-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) -; SSE4-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) -; SSE4-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) -; SSE4-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) -; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2 -; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 -; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 -; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 -; SSE4-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 -; SSE4-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 -; SSE4-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 -; SSE4-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 -; SSE4-NEXT: ret void +; AVX2-LABEL: @ctlz_8i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX2-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 +; AVX2-NEXT: ret void ; -; AVX-LABEL: @ctlz_8i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 -; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false) -; AVX-NEXT: store <8 x i32> [[TMP2]], ptr 
@dst32, align 2 -; AVX-NEXT: ret void +; AVX512-LABEL: @ctlz_8i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX512-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 +; AVX512-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 2 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 @@ -1063,3 +1102,6 @@ define void @ctlz_undef_32i8() #0 { } attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SSE2: {{.*}} +; SSE4: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll index 22f0c3f936509..896be6f2fe213 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll @@ -142,11 +142,32 @@ define void @cttz_4i32() #0 { ; SSE-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @cttz_4i32( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) -; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @cttz_4i32( +; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 +; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 +; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 +; AVX1-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false) +; AVX1-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) +; AVX1-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false) +; AVX1-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) +; AVX1-NEXT: store i32 [[CTTZ0]], ptr @dst32, align 4 +; AVX1-NEXT: store i32 [[CTTZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 +; AVX1-NEXT: store i32 [[CTTZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 +; AVX1-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @cttz_4i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @cttz_4i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX512-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX512-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 4 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 @@ -191,11 +212,44 @@ define void @cttz_8i32() #0 { ; SSE-NEXT: store i32 [[CTTZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 ; SSE-NEXT: ret void ; -; AVX-LABEL: @cttz_8i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 -; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false) -; AVX-NEXT: 
store <8 x i32> [[TMP2]], ptr @dst32, align 2 -; AVX-NEXT: ret void +; AVX1-LABEL: @cttz_8i32( +; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 +; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 +; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 +; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 +; AVX1-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; AVX1-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 +; AVX1-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 +; AVX1-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false) +; AVX1-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) +; AVX1-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false) +; AVX1-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) +; AVX1-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 false) +; AVX1-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false) +; AVX1-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false) +; AVX1-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false) +; AVX1-NEXT: store i32 [[CTTZ0]], ptr @dst32, align 2 +; AVX1-NEXT: store i32 [[CTTZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 +; AVX1-NEXT: store i32 [[CTTZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 +; AVX1-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 +; AVX1-NEXT: store i32 [[CTTZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; AVX1-NEXT: store i32 [[CTTZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 +; AVX1-NEXT: store i32 [[CTTZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 +; AVX1-NEXT: store i32 [[CTTZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @cttz_8i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX2-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @cttz_8i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX512-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 +; AVX512-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 2 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 From e4514293f99962b47d881d5b40722c6b56a1f425 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Sun, 26 Jan 2025 16:17:21 +0100 Subject: [PATCH 138/432] [Clang] Correctly determine constexprness of dependent lambdas. 
(#124468)

We skipped checking whether a lambda is constexpr when the parent
context was dependent, even if the lambda itself wasn't dependent (and
there is no other opportunity to establish its constexprness).

Fixes #114234
Fixes #97958
---
 clang/docs/ReleaseNotes.rst                  |  1 +
 clang/lib/Sema/SemaLambda.cpp                | 14 +++++------
 .../test/SemaCXX/cxx1z-constexpr-lambdas.cpp | 24 +++++++++++++++++++
 3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e9fffddd507c6..b1238db758845 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -991,6 +991,7 @@ Bug Fixes to C++ Support
 - Fixed assertions or false compiler diagnostics in the case of C++ modules for
   lambda functions or inline friend functions defined inside templates (#GH122493).
 - Clang now rejects declaring an alias template with the same name as its template parameter. (#GH123423)
+- Correctly determine the implicit constexprness of lambdas in dependent contexts. (#GH97958) (#GH114234)

 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp
index 87b3ca53cefaf..ceb32ee15dfa3 100644
--- a/clang/lib/Sema/SemaLambda.cpp
+++ b/clang/lib/Sema/SemaLambda.cpp
@@ -2239,18 +2239,18 @@ ExprResult Sema::BuildLambdaExpr(SourceLocation StartLoc, SourceLocation EndLoc,

   Cleanup.mergeFrom(LambdaCleanup);

-  LambdaExpr *Lambda = LambdaExpr::Create(Context, Class, IntroducerRange,
-                                          CaptureDefault, CaptureDefaultLoc,
-                                          ExplicitParams, ExplicitResultType,
-                                          CaptureInits, EndLoc,
-                                          ContainsUnexpandedParameterPack);
+  LambdaExpr *Lambda =
+      LambdaExpr::Create(Context, Class, IntroducerRange, CaptureDefault,
+                         CaptureDefaultLoc, ExplicitParams, ExplicitResultType,
+                         CaptureInits, EndLoc, ContainsUnexpandedParameterPack);
+
   // If the lambda expression's call operator is not explicitly marked constexpr
-  // and we are not in a dependent context, analyze the call operator to infer
+  // and is not dependent, analyze the call operator to infer
   // its constexpr-ness, suppressing diagnostics while doing so.
   if (getLangOpts().CPlusPlus17 && !CallOperator->isInvalidDecl() &&
       !CallOperator->isConstexpr() &&
       !isa<CoroutineBodyStmt>(CallOperator->getBody()) &&
-      !Class->getDeclContext()->isDependentContext()) {
+      !Class->isDependentContext()) {
     CallOperator->setConstexprKind(
         CheckConstexprFunctionDefinition(CallOperator,
                                          CheckConstexprKind::CheckValid)
diff --git a/clang/test/SemaCXX/cxx1z-constexpr-lambdas.cpp b/clang/test/SemaCXX/cxx1z-constexpr-lambdas.cpp
index 6a1f48bf7958f..0c20dd9dc58c6 100644
--- a/clang/test/SemaCXX/cxx1z-constexpr-lambdas.cpp
+++ b/clang/test/SemaCXX/cxx1z-constexpr-lambdas.cpp
@@ -349,3 +349,27 @@ static_assert(OtherCaptures(), "");
 } // namespace PR36054

 #endif // ndef CPP14_AND_EARLIER
+
+
+#if __cpp_constexpr >= 201907L
+namespace GH114234 {
+template <auto Arg>
+auto g() { return Arg; }
+
+template <typename>
+auto f() {
+  []<typename>() {
+    g<[] { return 123; }()>();
+  }.template operator()<int>();
+}
+
+void test() { f<int>(); }
+}
+
+namespace GH97958 {
+static_assert(
+    []() -> decltype([]{ return true; })
+    { return {}; }()());
+}
+
+#endif

From 0c784851c50b6b5b844e6a1f21bbe73efac332d4 Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Sun, 26 Jan 2025 15:32:46 +0000
Subject: [PATCH 139/432] [MathExtras] Favor using the hexadecimal FP constants
 (#123180)

This just fixes a TODO now that we are using C++17.
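For context: hexadecimal floating-point literals are a C++17 feature (hence the old TODO), and they spell out the significand bits exactly, so the rewrite below swaps each constant's spelling without changing its value. A minimal standalone sketch of the equivalence, not part of the patch (the pi constant is copied from the table below; the assertion is purely illustrative):

```cpp
// Sketch only: a hex FP literal and its decimal spelling can denote
// exactly the same double. Compile with -std=c++17.
#include <cstdio>

int main() {
  constexpr double pi_hex = 0x1.921fb54442d18P+1;  // significand bits spelled out
  constexpr double pi_dec = 3.1415926535897932385; // rounds to the same double
  static_assert(pi_hex == pi_dec, "both spellings denote one value");
  std::printf("%.17g\n", pi_hex); // prints 3.1415926535897931
  return 0;
}
```

Keeping the old decimal spelling in a trailing comment, as the patch does, preserves readability while the literal itself is now exact by construction.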
--- llvm/include/llvm/Support/MathExtras.h | 61 +++++++++++++------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h index 574e9a6116603..5a6f51adc07f3 100644 --- a/llvm/include/llvm/Support/MathExtras.h +++ b/llvm/include/llvm/Support/MathExtras.h @@ -43,38 +43,37 @@ using common_sint = /// Mathematical constants. namespace numbers { // TODO: Track C++20 std::numbers. -// TODO: Favor using the hexadecimal FP constants (requires C++17). // clang-format off -constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145769P+1) https://oeis.org/A001113 - egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 - ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 - ln10 = 2.3025850929940456840, // (0x1.26bb1bbb55516P+1) https://oeis.org/A002392 - log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) - log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) - pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 - inv_pi = .31830988618379067154, // (0x1.45f306dc9c883P-2) https://oeis.org/A049541 - sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 - inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 - sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219 - inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) - sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 - inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) - phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 -constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 - egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 - ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 - ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 - log2ef = 1.44269504F, // (0x1.715476P+0) - log10ef = .434294482F, // (0x1.bcb7b2P-2) - pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 - inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 - sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 - inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 - sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 - inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) - sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 - inv_sqrt3f = .577350269F, // (0x1.279a74P-1) - phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 +constexpr double e = 0x1.5bf0a8b145769P+1, // (2.7182818284590452354) https://oeis.org/A001113 + egamma = 0x1.2788cfc6fb619P-1, // (.57721566490153286061) https://oeis.org/A001620 + ln2 = 0x1.62e42fefa39efP-1, // (.69314718055994530942) https://oeis.org/A002162 + ln10 = 0x1.26bb1bbb55516P+1, // (2.3025850929940456840) https://oeis.org/A002392 + log2e = 0x1.71547652b82feP+0, // (1.4426950408889634074) + log10e = 0x1.bcb7b1526e50eP-2, // (.43429448190325182765) + pi = 0x1.921fb54442d18P+1, // (3.1415926535897932385) https://oeis.org/A000796 + inv_pi = 0x1.45f306dc9c883P-2, // (.31830988618379067154) https://oeis.org/A049541 + sqrtpi = 0x1.c5bf891b4ef6bP+0, // (1.7724538509055160273) https://oeis.org/A002161 + inv_sqrtpi = 0x1.20dd750429b6dP-1, // (.56418958354775628695) https://oeis.org/A087197 + sqrt2 = 0x1.6a09e667f3bcdP+0, // (1.4142135623730950488) 
https://oeis.org/A00219
+                 inv_sqrt2   = 0x1.6a09e667f3bcdP-1, // (.70710678118654752440)
+                 sqrt3       = 0x1.bb67ae8584caaP+0, // (1.7320508075688772935) https://oeis.org/A002194
+                 inv_sqrt3   = 0x1.279a74590331cP-1, // (.57735026918962576451)
+                 phi         = 0x1.9e3779b97f4a8P+0; // (1.6180339887498948482) https://oeis.org/A001622
+constexpr float ef          = 0x1.5bf0a8P+1F, // (2.71828183) https://oeis.org/A001113
+                egammaf     = 0x1.2788d0P-1F, // (.577215665) https://oeis.org/A001620
+                ln2f        = 0x1.62e430P-1F, // (.693147181) https://oeis.org/A002162
+                ln10f       = 0x1.26bb1cP+1F, // (2.30258509) https://oeis.org/A002392
+                log2ef      = 0x1.715476P+0F, // (1.44269504)
+                log10ef     = 0x1.bcb7b2P-2F, // (.434294482)
+                pif         = 0x1.921fb6P+1F, // (3.14159265) https://oeis.org/A000796
+                inv_pif     = 0x1.45f306P-2F, // (.318309886) https://oeis.org/A049541
+                sqrtpif     = 0x1.c5bf8aP+0F, // (1.77245385) https://oeis.org/A002161
+                inv_sqrtpif = 0x1.20dd76P-1F, // (.564189584) https://oeis.org/A087197
+                sqrt2f      = 0x1.6a09e6P+0F, // (1.41421356) https://oeis.org/A002193
+                inv_sqrt2f  = 0x1.6a09e6P-1F, // (.707106781)
+                sqrt3f      = 0x1.bb67aeP+0F, // (1.73205081) https://oeis.org/A002194
+                inv_sqrt3f  = 0x1.279a74P-1F, // (.577350269)
+                phif        = 0x1.9e377aP+0F; // (1.61803399) https://oeis.org/A001622
 // clang-format on
 } // namespace numbers

From 33ad474c45e6d7a0de7bc75e15e27cf6cb9ff895 Mon Sep 17 00:00:00 2001
From: Manuel Sainz de Baranda y Goñi
Date: Sun, 26 Jan 2025 16:48:42 +0100
Subject: [PATCH 140/432] [Clang] Add predefined macros for integer constants
 (#123514)

This adds predefined macros for integer constants to implement section
7.18.4 of ISO/IEC 9899:1999 in `<stdint.h>` in a safe way:

```
__INT8_C(c)
__INT16_C(c)
__INT32_C(c)
__INT64_C(c)
__INTMAX_C(c)
__UINT8_C(c)
__UINT16_C(c)
__UINT32_C(c)
__UINT64_C(c)
__UINTMAX_C(c)
```

This improves compatibility with GCC and makes it trivial to implement
section 7.18.4 of ISO/IEC 9899:1999. Clang defines `__INT<N>_C_SUFFIX__`,
`__UINT<N>_C_SUFFIX__`, `__INTMAX_C_SUFFIX__` and `__UINTMAX_C_SUFFIX__`,
but these macros are useless for this purpose. Let's say, for example,
that `__INT64_C_SUFFIX__` expands to `L` or `LL`.
If the user defines them as macros, the compiler will produce errors if
`INT64_C` is implemented in `<stdint.h>` using `__INT64_C_SUFFIX__`:

**minimal-test.c:**
```cpp
#if defined(__clang__) && !defined(__INT64_C)
#	pragma clang diagnostic push
#	pragma clang diagnostic ignored "-Wreserved-identifier"
#	define __PSTDC_INT_C_(literal, suffix) literal##suffix
#	define __PSTDC_INT_C(literal, suffix) __PSTDC_INT_C_(literal, suffix)
#	define INT64_C(literal) __PSTDC_INT_C(literal, __INT64_C_SUFFIX__)
#	pragma clang diagnostic pop
#elif defined(__GNUC__)
#	define INT64_C __INT64_C
#endif

typedef __INT64_TYPE__ int64_t;

#define L  "Make Clang produce an error"
#define LL "Make Clang produce an error"

int main(int argc, char **argv) {
  (void)argc; (void)argv;
  int64_t v = INT64_C(9223372036854775807);
  (void)v;
  return 0;
}
```

**test.c:**
```cpp
#if defined(__clang__) && !defined(__INT8_C)
#	pragma clang diagnostic push
#	pragma clang diagnostic ignored "-Wreserved-identifier"
#	define __PSTDC_INT_C_(literal, suffix) literal##suffix
#	define __PSTDC_INT_C(literal, suffix) __PSTDC_INT_C_(literal, suffix)
#	define INT8_C(literal)    __PSTDC_INT_C(literal, __INT8_C_SUFFIX__)
#	define INT16_C(literal)   __PSTDC_INT_C(literal, __INT16_C_SUFFIX__)
#	define INT32_C(literal)   __PSTDC_INT_C(literal, __INT32_C_SUFFIX__)
#	define INT64_C(literal)   __PSTDC_INT_C(literal, __INT64_C_SUFFIX__)
#	define INTMAX_C(literal)  __PSTDC_INT_C(literal, __INTMAX_C_SUFFIX__)
#	define UINT8_C(literal)   __PSTDC_INT_C(literal, __UINT8_C_SUFFIX__)
#	define UINT16_C(literal)  __PSTDC_INT_C(literal, __UINT16_C_SUFFIX__)
#	define UINT32_C(literal)  __PSTDC_INT_C(literal, __UINT32_C_SUFFIX__)
#	define UINT64_C(literal)  __PSTDC_INT_C(literal, __UINT64_C_SUFFIX__)
#	define UINTMAX_C(literal) __PSTDC_INT_C(literal, __UINTMAX_C_SUFFIX__)
#	pragma clang diagnostic pop
#else
#	define INT8_C    __INT8_C
#	define INT16_C   __INT16_C
#	define INT32_C   __INT32_C
#	define INT64_C   __INT64_C
#	define INTMAX_C  __INTMAX_C
#	define UINT8_C   __UINT8_C
#	define UINT16_C  __UINT16_C
#	define UINT32_C  __UINT32_C
#	define UINT64_C  __UINT64_C
#	define UINTMAX_C __UINTMAX_C
#endif

typedef __INT8_TYPE__    int8_t;
typedef __INT16_TYPE__   int16_t;
typedef __INT32_TYPE__   int32_t;
typedef __INT64_TYPE__   int64_t;
typedef __INTMAX_TYPE__  intmax_t;
typedef __UINT8_TYPE__   uint8_t;
typedef __UINT16_TYPE__  uint16_t;
typedef __UINT32_TYPE__  uint32_t;
typedef __UINT64_TYPE__  uint64_t;
typedef __UINTMAX_TYPE__ uintmax_t;

#define L   "Make Clang produce an error"
#define LL  "Make Clang produce an error"
#define U   "Make Clang produce an error"
#define UL  "Make Clang produce an error"
#define ULL "Make Clang produce an error"

int main(int argc, char **argv) {
  (void)argc; (void)argv;

  int8_t    a = INT8_C   (127);
  int16_t   b = INT16_C  (32767);
  int32_t   c = INT32_C  (2147483647);
  int64_t   d = INT64_C  (9223372036854775807);
  intmax_t  e = INTMAX_C (9223372036854775807);
  uint8_t   f = UINT8_C  (255);
  uint16_t  g = UINT16_C (65535);
  uint32_t  h = UINT32_C (4294967295);
  uint64_t  i = UINT64_C (18446744073709551615);
  uintmax_t j = UINTMAX_C(18446744073709551615);

  (void)a; (void)b; (void)c; (void)d; (void)e;
  (void)f; (void)g; (void)h; (void)i; (void)j;
  return 0;
}
```
---
 clang/docs/ReleaseNotes.rst               |  11 +++
 clang/lib/Frontend/InitPreprocessor.cpp   |  14 ++-
 clang/test/Preprocessor/init-aarch64.c    |  35 ++++++++
 clang/test/Preprocessor/init-arm.c        |  71 +++++++++++++++
 clang/test/Preprocessor/init-csky.c       |  10 +++
 clang/test/Preprocessor/init-loongarch.c  |  20 +++++
 clang/test/Preprocessor/init-mips.c       |  60 +++++++++++++
 clang/test/Preprocessor/init-ppc.c        |  40 ++++++++
 clang/test/Preprocessor/init-ppc64.c      |  40 ++++++++
 clang/test/Preprocessor/init-s390x.c      |  10 +++
 clang/test/Preprocessor/init-v7k-compat.c |  10 +++
 clang/test/Preprocessor/init-ve.c         |  10 +++
 clang/test/Preprocessor/init-x86.c        |  70 ++++++++++++++
 clang/test/Preprocessor/init.c            | 101 ++++++++++++++++++++++
 14 files changed, 498 insertions(+), 4 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b1238db758845..b63bd366cfe88 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -479,6 +479,17 @@ Non-comprehensive list of changes in this release
   ``__builtin_elementwise_sub_sat``, ``__builtin_reduce_min`` (For integral element type),
   ``__builtin_reduce_max`` (For integral element type).

+- The builtin macros ``__INT8_C``, ``__INT16_C``, ``__INT32_C``, ``__INT64_C``,
+  ``__INTMAX_C``, ``__UINT8_C``, ``__UINT16_C``, ``__UINT32_C``, ``__UINT64_C``
+  and ``__UINTMAX_C`` have been introduced to ease the implementation of section
+  7.18.4 of ISO/IEC 9899:1999. These macros are also defined by GCC and should
+  be used instead of others that expand and paste the suffixes provided by
+  ``__INT8_C_SUFFIX__``, ``__INT16_C_SUFFIX__``, ``__INT32_C_SUFFIX__``,
+  ``__INT64_C_SUFFIX__``, ``__INTMAX_C_SUFFIX__``, ``__UINT8_C_SUFFIX__``,
+  ``__UINT16_C_SUFFIX__``, ``__UINT32_C_SUFFIX__``, ``__UINT64_C_SUFFIX__`` and
+  ``__UINTMAX_C_SUFFIX__``. Pasting suffixes after the expansion of their
+  respective macros is unsafe, as users can define the suffixes as macros.
+
 - Clang now rejects ``_BitInt`` matrix element types if the bit width is less than ``CHAR_WIDTH`` or
   not a power of two, matching preexisting behaviour for vector types.

diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 29723b573e771..17f624e964539 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -253,6 +253,8 @@ static void DefineExactWidthIntType(const LangOptions &LangOpts,
   StringRef ConstSuffix(TI.getTypeConstantSuffix(Ty));
   Builder.defineMacro(Prefix + Twine(TypeWidth) + "_C_SUFFIX__", ConstSuffix);
+  Builder.defineMacro(Prefix + Twine(TypeWidth) + "_C(c)",
+                      ConstSuffix.size() ? Twine("c##") + ConstSuffix : "c");
 }

 static void DefineExactWidthIntTypeSize(TargetInfo::IntType Ty,
@@ -1164,12 +1166,16 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   DefineType("__INTMAX_TYPE__", TI.getIntMaxType(), Builder);
   DefineFmt(LangOpts, "__INTMAX", TI.getIntMaxType(), TI, Builder);
-  Builder.defineMacro("__INTMAX_C_SUFFIX__",
-                      TI.getTypeConstantSuffix(TI.getIntMaxType()));
+  StringRef ConstSuffix(TI.getTypeConstantSuffix(TI.getIntMaxType()));
+  Builder.defineMacro("__INTMAX_C_SUFFIX__", ConstSuffix);
+  Builder.defineMacro("__INTMAX_C(c)",
+                      ConstSuffix.size() ? Twine("c##") + ConstSuffix : "c");
   DefineType("__UINTMAX_TYPE__", TI.getUIntMaxType(), Builder);
   DefineFmt(LangOpts, "__UINTMAX", TI.getUIntMaxType(), TI, Builder);
-  Builder.defineMacro("__UINTMAX_C_SUFFIX__",
-                      TI.getTypeConstantSuffix(TI.getUIntMaxType()));
+  ConstSuffix = TI.getTypeConstantSuffix(TI.getUIntMaxType());
+  Builder.defineMacro("__UINTMAX_C_SUFFIX__", ConstSuffix);
+  Builder.defineMacro("__UINTMAX_C(c)",
+                      ConstSuffix.size() ?
Twine("c##") + ConstSuffix : "c"); DefineType("__PTRDIFF_TYPE__", TI.getPtrDiffType(LangAS::Default), Builder); DefineFmt(LangOpts, "__PTRDIFF", TI.getPtrDiffType(LangAS::Default), TI, Builder); diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index 8578993dbfaeb..b5e77ba10c347 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -135,26 +135,31 @@ // AARCH64_CXX-NEXT: #define __GLIBCXX_BITSIZE_INT_N_0 128 // AARCH64_CXX-NEXT: #define __GLIBCXX_TYPE_INT_N_0 __int128 // AARCH64-NEXT: #define __HAVE_FUNCTION_MULTI_VERSIONING 1 +// AARCH64-NEXT: #define __INT16_C(c) c // AARCH64-NEXT: #define __INT16_C_SUFFIX__ // AARCH64-NEXT: #define __INT16_FMTd__ "hd" // AARCH64-NEXT: #define __INT16_FMTi__ "hi" // AARCH64-NEXT: #define __INT16_MAX__ 32767 // AARCH64-NEXT: #define __INT16_TYPE__ short +// AARCH64-NEXT: #define __INT32_C(c) c // AARCH64-NEXT: #define __INT32_C_SUFFIX__ // AARCH64-NEXT: #define __INT32_FMTd__ "d" // AARCH64-NEXT: #define __INT32_FMTi__ "i" // AARCH64-NEXT: #define __INT32_MAX__ 2147483647 // AARCH64-NEXT: #define __INT32_TYPE__ int +// AARCH64-NEXT: #define __INT64_C(c) c##L // AARCH64-NEXT: #define __INT64_C_SUFFIX__ L // AARCH64-NEXT: #define __INT64_FMTd__ "ld" // AARCH64-NEXT: #define __INT64_FMTi__ "li" // AARCH64-NEXT: #define __INT64_MAX__ 9223372036854775807L // AARCH64-NEXT: #define __INT64_TYPE__ long int +// AARCH64-NEXT: #define __INT8_C(c) c // AARCH64-NEXT: #define __INT8_C_SUFFIX__ // AARCH64-NEXT: #define __INT8_FMTd__ "hhd" // AARCH64-NEXT: #define __INT8_FMTi__ "hhi" // AARCH64-NEXT: #define __INT8_MAX__ 127 // AARCH64-NEXT: #define __INT8_TYPE__ signed char +// AARCH64-NEXT: #define __INTMAX_C(c) c##L // AARCH64-NEXT: #define __INTMAX_C_SUFFIX__ L // AARCH64-NEXT: #define __INTMAX_FMTd__ "ld" // AARCH64-NEXT: #define __INTMAX_FMTi__ "li" @@ -287,6 +292,7 @@ // AARCH64-NEXT: #define __STDC_UTF_32__ 1 // AARCH64_C: #define __STDC_VERSION__ 201710L // AARCH64-NEXT: #define __STDC__ 1 +// AARCH64-NEXT: #define __UINT16_C(c) c // AARCH64-NEXT: #define __UINT16_C_SUFFIX__ // AARCH64-NEXT: #define __UINT16_FMTX__ "hX" // AARCH64-NEXT: #define __UINT16_FMTo__ "ho" @@ -294,6 +300,7 @@ // AARCH64-NEXT: #define __UINT16_FMTx__ "hx" // AARCH64-NEXT: #define __UINT16_MAX__ 65535 // AARCH64-NEXT: #define __UINT16_TYPE__ unsigned short +// AARCH64-NEXT: #define __UINT32_C(c) c##U // AARCH64-NEXT: #define __UINT32_C_SUFFIX__ U // AARCH64-NEXT: #define __UINT32_FMTX__ "X" // AARCH64-NEXT: #define __UINT32_FMTo__ "o" @@ -301,6 +308,7 @@ // AARCH64-NEXT: #define __UINT32_FMTx__ "x" // AARCH64-NEXT: #define __UINT32_MAX__ 4294967295U // AARCH64-NEXT: #define __UINT32_TYPE__ unsigned int +// AARCH64-NEXT: #define __UINT64_C(c) c##UL // AARCH64-NEXT: #define __UINT64_C_SUFFIX__ UL // AARCH64-NEXT: #define __UINT64_FMTX__ "lX" // AARCH64-NEXT: #define __UINT64_FMTo__ "lo" @@ -308,6 +316,7 @@ // AARCH64-NEXT: #define __UINT64_FMTx__ "lx" // AARCH64-NEXT: #define __UINT64_MAX__ 18446744073709551615UL // AARCH64-NEXT: #define __UINT64_TYPE__ long unsigned int +// AARCH64-NEXT: #define __UINT8_C(c) c // AARCH64-NEXT: #define __UINT8_C_SUFFIX__ // AARCH64-NEXT: #define __UINT8_FMTX__ "hhX" // AARCH64-NEXT: #define __UINT8_FMTo__ "hho" @@ -315,6 +324,7 @@ // AARCH64-NEXT: #define __UINT8_FMTx__ "hhx" // AARCH64-NEXT: #define __UINT8_MAX__ 255 // AARCH64-NEXT: #define __UINT8_TYPE__ unsigned char +// AARCH64-NEXT: #define __UINTMAX_C(c) c##UL // AARCH64-NEXT: #define __UINTMAX_C_SUFFIX__ UL // 
AARCH64-NEXT: #define __UINTMAX_FMTX__ "lX" // AARCH64-NEXT: #define __UINTMAX_FMTo__ "lo" @@ -435,26 +445,31 @@ // AARCH64-DARWIN: #define __FLT_MIN__ 1.17549435e-38F // AARCH64-DARWIN: #define __FLT_RADIX__ 2 // AARCH64-DARWIN: #define __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL 202430 +// AARCH64-DARWIN: #define __INT16_C(c) c // AARCH64-DARWIN: #define __INT16_C_SUFFIX__ // AARCH64-DARWIN: #define __INT16_FMTd__ "hd" // AARCH64-DARWIN: #define __INT16_FMTi__ "hi" // AARCH64-DARWIN: #define __INT16_MAX__ 32767 // AARCH64-DARWIN: #define __INT16_TYPE__ short +// AARCH64-DARWIN: #define __INT32_C(c) c // AARCH64-DARWIN: #define __INT32_C_SUFFIX__ // AARCH64-DARWIN: #define __INT32_FMTd__ "d" // AARCH64-DARWIN: #define __INT32_FMTi__ "i" // AARCH64-DARWIN: #define __INT32_MAX__ 2147483647 // AARCH64-DARWIN: #define __INT32_TYPE__ int +// AARCH64-DARWIN: #define __INT64_C(c) c##LL // AARCH64-DARWIN: #define __INT64_C_SUFFIX__ LL // AARCH64-DARWIN: #define __INT64_FMTd__ "lld" // AARCH64-DARWIN: #define __INT64_FMTi__ "lli" // AARCH64-DARWIN: #define __INT64_MAX__ 9223372036854775807LL // AARCH64-DARWIN: #define __INT64_TYPE__ long long int +// AARCH64-DARWIN: #define __INT8_C(c) c // AARCH64-DARWIN: #define __INT8_C_SUFFIX__ // AARCH64-DARWIN: #define __INT8_FMTd__ "hhd" // AARCH64-DARWIN: #define __INT8_FMTi__ "hhi" // AARCH64-DARWIN: #define __INT8_MAX__ 127 // AARCH64-DARWIN: #define __INT8_TYPE__ signed char +// AARCH64-DARWIN: #define __INTMAX_C(c) c##L // AARCH64-DARWIN: #define __INTMAX_C_SUFFIX__ L // AARCH64-DARWIN: #define __INTMAX_FMTd__ "ld" // AARCH64-DARWIN: #define __INTMAX_FMTi__ "li" @@ -538,18 +553,23 @@ // AARCH64-DARWIN: #define __SIZE_MAX__ 18446744073709551615UL // AARCH64-DARWIN: #define __SIZE_TYPE__ long unsigned int // AARCH64-DARWIN: #define __SIZE_WIDTH__ 64 +// AARCH64-DARWIN: #define __UINT16_C(c) c // AARCH64-DARWIN: #define __UINT16_C_SUFFIX__ // AARCH64-DARWIN: #define __UINT16_MAX__ 65535 // AARCH64-DARWIN: #define __UINT16_TYPE__ unsigned short +// AARCH64-DARWIN: #define __UINT32_C(c) c##U // AARCH64-DARWIN: #define __UINT32_C_SUFFIX__ U // AARCH64-DARWIN: #define __UINT32_MAX__ 4294967295U // AARCH64-DARWIN: #define __UINT32_TYPE__ unsigned int +// AARCH64-DARWIN: #define __UINT64_C(c) c##ULL // AARCH64-DARWIN: #define __UINT64_C_SUFFIX__ ULL // AARCH64-DARWIN: #define __UINT64_MAX__ 18446744073709551615ULL // AARCH64-DARWIN: #define __UINT64_TYPE__ long long unsigned int +// AARCH64-DARWIN: #define __UINT8_C(c) c // AARCH64-DARWIN: #define __UINT8_C_SUFFIX__ // AARCH64-DARWIN: #define __UINT8_MAX__ 255 // AARCH64-DARWIN: #define __UINT8_TYPE__ unsigned char +// AARCH64-DARWIN: #define __UINTMAX_C(c) c##UL // AARCH64-DARWIN: #define __UINTMAX_C_SUFFIX__ UL // AARCH64-DARWIN: #define __UINTMAX_MAX__ 18446744073709551615UL // AARCH64-DARWIN: #define __UINTMAX_TYPE__ long unsigned int @@ -703,18 +723,23 @@ // AARCH64-MSVC: #define __STDC_UTF_32__ 1 // AARCH64-MSVC: #define __STDC_VERSION__ 201710L // AARCH64-MSVC: #define __STDC__ 1 +// AARCH64-MSVC: #define __UINT16_C(c) c // AARCH64-MSVC: #define __UINT16_C_SUFFIX__ // AARCH64-MSVC: #define __UINT16_MAX__ 65535 // AARCH64-MSVC: #define __UINT16_TYPE__ unsigned short +// AARCH64-MSVC: #define __UINT32_C(c) c##U // AARCH64-MSVC: #define __UINT32_C_SUFFIX__ U // AARCH64-MSVC: #define __UINT32_MAX__ 4294967295U // AARCH64-MSVC: #define __UINT32_TYPE__ unsigned int +// AARCH64-MSVC: #define __UINT64_C(c) c##ULL // AARCH64-MSVC: #define __UINT64_C_SUFFIX__ ULL // AARCH64-MSVC: #define __UINT64_MAX__ 
18446744073709551615ULL // AARCH64-MSVC: #define __UINT64_TYPE__ long long unsigned int +// AARCH64-MSVC: #define __UINT8_C(c) c // AARCH64-MSVC: #define __UINT8_C_SUFFIX__ // AARCH64-MSVC: #define __UINT8_MAX__ 255 // AARCH64-MSVC: #define __UINT8_TYPE__ unsigned char +// AARCH64-MSVC: #define __UINTMAX_C(c) c##ULL // AARCH64-MSVC: #define __UINTMAX_C_SUFFIX__ ULL // AARCH64-MSVC: #define __UINTMAX_MAX__ 18446744073709551615ULL // AARCH64-MSVC: #define __UINTMAX_TYPE__ long long unsigned int @@ -867,26 +892,31 @@ // ARM64EC-MSVC: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 // ARM64EC-MSVC: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1 // ARM64EC-MSVC: #define __HAVE_FUNCTION_MULTI_VERSIONING 1 +// ARM64EC-MSVC: #define __INT16_C(c) c // ARM64EC-MSVC: #define __INT16_C_SUFFIX__ // ARM64EC-MSVC: #define __INT16_FMTd__ "hd" // ARM64EC-MSVC: #define __INT16_FMTi__ "hi" // ARM64EC-MSVC: #define __INT16_MAX__ 32767 // ARM64EC-MSVC: #define __INT16_TYPE__ short +// ARM64EC-MSVC: #define __INT32_C(c) c // ARM64EC-MSVC: #define __INT32_C_SUFFIX__ // ARM64EC-MSVC: #define __INT32_FMTd__ "d" // ARM64EC-MSVC: #define __INT32_FMTi__ "i" // ARM64EC-MSVC: #define __INT32_MAX__ 2147483647 // ARM64EC-MSVC: #define __INT32_TYPE__ int +// ARM64EC-MSVC: #define __INT64_C(c) c##LL // ARM64EC-MSVC: #define __INT64_C_SUFFIX__ LL // ARM64EC-MSVC: #define __INT64_FMTd__ "lld" // ARM64EC-MSVC: #define __INT64_FMTi__ "lli" // ARM64EC-MSVC: #define __INT64_MAX__ 9223372036854775807LL // ARM64EC-MSVC: #define __INT64_TYPE__ long long int +// ARM64EC-MSVC: #define __INT8_C(c) c // ARM64EC-MSVC: #define __INT8_C_SUFFIX__ // ARM64EC-MSVC: #define __INT8_FMTd__ "hhd" // ARM64EC-MSVC: #define __INT8_FMTi__ "hhi" // ARM64EC-MSVC: #define __INT8_MAX__ 127 // ARM64EC-MSVC: #define __INT8_TYPE__ signed char +// ARM64EC-MSVC: #define __INTMAX_C(c) c##LL // ARM64EC-MSVC: #define __INTMAX_C_SUFFIX__ LL // ARM64EC-MSVC: #define __INTMAX_FMTd__ "lld" // ARM64EC-MSVC: #define __INTMAX_FMTi__ "lli" @@ -1013,6 +1043,7 @@ // ARM64EC-MSVC: #define __STDC_UTF_32__ 1 // ARM64EC-MSVC: #define __STDC_VERSION__ 201710L // ARM64EC-MSVC: #define __STDC__ 1 +// ARM64EC-MSVC: #define __UINT16_C(c) c // ARM64EC-MSVC: #define __UINT16_C_SUFFIX__ // ARM64EC-MSVC: #define __UINT16_FMTX__ "hX" // ARM64EC-MSVC: #define __UINT16_FMTo__ "ho" @@ -1020,6 +1051,7 @@ // ARM64EC-MSVC: #define __UINT16_FMTx__ "hx" // ARM64EC-MSVC: #define __UINT16_MAX__ 65535 // ARM64EC-MSVC: #define __UINT16_TYPE__ unsigned short +// ARM64EC-MSVC: #define __UINT32_C(c) c##U // ARM64EC-MSVC: #define __UINT32_C_SUFFIX__ U // ARM64EC-MSVC: #define __UINT32_FMTX__ "X" // ARM64EC-MSVC: #define __UINT32_FMTo__ "o" @@ -1027,6 +1059,7 @@ // ARM64EC-MSVC: #define __UINT32_FMTx__ "x" // ARM64EC-MSVC: #define __UINT32_MAX__ 4294967295U // ARM64EC-MSVC: #define __UINT32_TYPE__ unsigned int +// ARM64EC-MSVC: #define __UINT64_C(c) c##ULL // ARM64EC-MSVC: #define __UINT64_C_SUFFIX__ ULL // ARM64EC-MSVC: #define __UINT64_FMTX__ "llX" // ARM64EC-MSVC: #define __UINT64_FMTo__ "llo" @@ -1034,6 +1067,7 @@ // ARM64EC-MSVC: #define __UINT64_FMTx__ "llx" // ARM64EC-MSVC: #define __UINT64_MAX__ 18446744073709551615ULL // ARM64EC-MSVC: #define __UINT64_TYPE__ long long unsigned int +// ARM64EC-MSVC: #define __UINT8_C(c) c // ARM64EC-MSVC: #define __UINT8_C_SUFFIX__ // ARM64EC-MSVC: #define __UINT8_FMTX__ "hhX" // ARM64EC-MSVC: #define __UINT8_FMTo__ "hho" @@ -1041,6 +1075,7 @@ // ARM64EC-MSVC: #define __UINT8_FMTx__ "hhx" // ARM64EC-MSVC: #define __UINT8_MAX__ 255 // ARM64EC-MSVC: #define 
__UINT8_TYPE__ unsigned char +// ARM64EC-MSVC: #define __UINTMAX_C(c) c##ULL // ARM64EC-MSVC: #define __UINTMAX_C_SUFFIX__ ULL // ARM64EC-MSVC: #define __UINTMAX_FMTX__ "llX" // ARM64EC-MSVC: #define __UINTMAX_FMTo__ "llo" diff --git a/clang/test/Preprocessor/init-arm.c b/clang/test/Preprocessor/init-arm.c index 6e3acacc5c3a5..d2fcfe94bcd3d 100644 --- a/clang/test/Preprocessor/init-arm.c +++ b/clang/test/Preprocessor/init-arm.c @@ -46,26 +46,31 @@ // ARM:#define __FLT_MIN_EXP__ (-125) // ARM:#define __FLT_MIN__ 1.17549435e-38F // ARM:#define __FLT_RADIX__ 2 +// ARM:#define __INT16_C(c) c // ARM:#define __INT16_C_SUFFIX__ // ARM:#define __INT16_FMTd__ "hd" // ARM:#define __INT16_FMTi__ "hi" // ARM:#define __INT16_MAX__ 32767 // ARM:#define __INT16_TYPE__ short +// ARM:#define __INT32_C(c) c // ARM:#define __INT32_C_SUFFIX__ // ARM:#define __INT32_FMTd__ "d" // ARM:#define __INT32_FMTi__ "i" // ARM:#define __INT32_MAX__ 2147483647 // ARM:#define __INT32_TYPE__ int +// ARM:#define __INT64_C(c) c##LL // ARM:#define __INT64_C_SUFFIX__ LL // ARM:#define __INT64_FMTd__ "lld" // ARM:#define __INT64_FMTi__ "lli" // ARM:#define __INT64_MAX__ 9223372036854775807LL // ARM:#define __INT64_TYPE__ long long int +// ARM:#define __INT8_C(c) c // ARM:#define __INT8_C_SUFFIX__ // ARM:#define __INT8_FMTd__ "hhd" // ARM:#define __INT8_FMTi__ "hhi" // ARM:#define __INT8_MAX__ 127 // ARM:#define __INT8_TYPE__ signed char +// ARM:#define __INTMAX_C(c) c##LL // ARM:#define __INTMAX_C_SUFFIX__ LL // ARM:#define __INTMAX_FMTd__ "lld" // ARM:#define __INTMAX_FMTi__ "lli" @@ -151,18 +156,23 @@ // ARM:#define __SIZE_TYPE__ unsigned int // ARM:#define __SIZE_WIDTH__ 32 // ARM-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8U +// ARM:#define __UINT16_C(c) c // ARM:#define __UINT16_C_SUFFIX__ // ARM:#define __UINT16_MAX__ 65535 // ARM:#define __UINT16_TYPE__ unsigned short +// ARM:#define __UINT32_C(c) c##U // ARM:#define __UINT32_C_SUFFIX__ U // ARM:#define __UINT32_MAX__ 4294967295U // ARM:#define __UINT32_TYPE__ unsigned int +// ARM:#define __UINT64_C(c) c##ULL // ARM:#define __UINT64_C_SUFFIX__ ULL // ARM:#define __UINT64_MAX__ 18446744073709551615ULL // ARM:#define __UINT64_TYPE__ long long unsigned int +// ARM:#define __UINT8_C(c) c // ARM:#define __UINT8_C_SUFFIX__ // ARM:#define __UINT8_MAX__ 255 // ARM:#define __UINT8_TYPE__ unsigned char +// ARM:#define __UINTMAX_C(c) c##ULL // ARM:#define __UINTMAX_C_SUFFIX__ ULL // ARM:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARM:#define __UINTMAX_TYPE__ long long unsigned int @@ -248,26 +258,31 @@ // ARM-BE:#define __FLT_MIN_EXP__ (-125) // ARM-BE:#define __FLT_MIN__ 1.17549435e-38F // ARM-BE:#define __FLT_RADIX__ 2 +// ARM-BE:#define __INT16_C(c) c // ARM-BE:#define __INT16_C_SUFFIX__ // ARM-BE:#define __INT16_FMTd__ "hd" // ARM-BE:#define __INT16_FMTi__ "hi" // ARM-BE:#define __INT16_MAX__ 32767 // ARM-BE:#define __INT16_TYPE__ short +// ARM-BE:#define __INT32_C(c) c // ARM-BE:#define __INT32_C_SUFFIX__ // ARM-BE:#define __INT32_FMTd__ "d" // ARM-BE:#define __INT32_FMTi__ "i" // ARM-BE:#define __INT32_MAX__ 2147483647 // ARM-BE:#define __INT32_TYPE__ int +// ARM-BE:#define __INT64_C(c) c##LL // ARM-BE:#define __INT64_C_SUFFIX__ LL // ARM-BE:#define __INT64_FMTd__ "lld" // ARM-BE:#define __INT64_FMTi__ "lli" // ARM-BE:#define __INT64_MAX__ 9223372036854775807LL // ARM-BE:#define __INT64_TYPE__ long long int +// ARM-BE:#define __INT8_C(c) c // ARM-BE:#define __INT8_C_SUFFIX__ // ARM-BE:#define __INT8_FMTd__ "hhd" // ARM-BE:#define __INT8_FMTi__ "hhi" // ARM-BE:#define 
__INT8_MAX__ 127 // ARM-BE:#define __INT8_TYPE__ signed char +// ARM-BE:#define __INTMAX_C(c) c##LL // ARM-BE:#define __INTMAX_C_SUFFIX__ LL // ARM-BE:#define __INTMAX_FMTd__ "lld" // ARM-BE:#define __INTMAX_FMTi__ "lli" @@ -351,18 +366,23 @@ // ARM-BE:#define __SIZE_MAX__ 4294967295U // ARM-BE:#define __SIZE_TYPE__ unsigned int // ARM-BE:#define __SIZE_WIDTH__ 32 +// ARM-BE:#define __UINT16_C(c) c // ARM-BE:#define __UINT16_C_SUFFIX__ // ARM-BE:#define __UINT16_MAX__ 65535 // ARM-BE:#define __UINT16_TYPE__ unsigned short +// ARM-BE:#define __UINT32_C(c) c##U // ARM-BE:#define __UINT32_C_SUFFIX__ U // ARM-BE:#define __UINT32_MAX__ 4294967295U // ARM-BE:#define __UINT32_TYPE__ unsigned int +// ARM-BE:#define __UINT64_C(c) c##ULL // ARM-BE:#define __UINT64_C_SUFFIX__ ULL // ARM-BE:#define __UINT64_MAX__ 18446744073709551615ULL // ARM-BE:#define __UINT64_TYPE__ long long unsigned int +// ARM-BE:#define __UINT8_C(c) c // ARM-BE:#define __UINT8_C_SUFFIX__ // ARM-BE:#define __UINT8_MAX__ 255 // ARM-BE:#define __UINT8_TYPE__ unsigned char +// ARM-BE:#define __UINTMAX_C(c) c##ULL // ARM-BE:#define __UINTMAX_C_SUFFIX__ ULL // ARM-BE:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARM-BE:#define __UINTMAX_TYPE__ long long unsigned int @@ -440,26 +460,31 @@ // ARMEABISOFT:#define __FLT_MIN_EXP__ (-125) // ARMEABISOFT:#define __FLT_MIN__ 1.17549435e-38F // ARMEABISOFT:#define __FLT_RADIX__ 2 +// ARMEABISOFT:#define __INT16_C(c) c // ARMEABISOFT:#define __INT16_C_SUFFIX__ // ARMEABISOFT:#define __INT16_FMTd__ "hd" // ARMEABISOFT:#define __INT16_FMTi__ "hi" // ARMEABISOFT:#define __INT16_MAX__ 32767 // ARMEABISOFT:#define __INT16_TYPE__ short +// ARMEABISOFT:#define __INT32_C(c) c // ARMEABISOFT:#define __INT32_C_SUFFIX__ // ARMEABISOFT:#define __INT32_FMTd__ "d" // ARMEABISOFT:#define __INT32_FMTi__ "i" // ARMEABISOFT:#define __INT32_MAX__ 2147483647 // ARMEABISOFT:#define __INT32_TYPE__ int +// ARMEABISOFT:#define __INT64_C(c) c##LL // ARMEABISOFT:#define __INT64_C_SUFFIX__ LL // ARMEABISOFT:#define __INT64_FMTd__ "lld" // ARMEABISOFT:#define __INT64_FMTi__ "lli" // ARMEABISOFT:#define __INT64_MAX__ 9223372036854775807LL // ARMEABISOFT:#define __INT64_TYPE__ long long int +// ARMEABISOFT:#define __INT8_C(c) c // ARMEABISOFT:#define __INT8_C_SUFFIX__ // ARMEABISOFT:#define __INT8_FMTd__ "hhd" // ARMEABISOFT:#define __INT8_FMTi__ "hhi" // ARMEABISOFT:#define __INT8_MAX__ 127 // ARMEABISOFT:#define __INT8_TYPE__ signed char +// ARMEABISOFT:#define __INTMAX_C(c) c##LL // ARMEABISOFT:#define __INTMAX_C_SUFFIX__ LL // ARMEABISOFT:#define __INTMAX_FMTd__ "lld" // ARMEABISOFT:#define __INTMAX_FMTi__ "lli" @@ -545,18 +570,23 @@ // ARMEABISOFT:#define __SIZE_TYPE__ unsigned int // ARMEABISOFT:#define __SIZE_WIDTH__ 32 // ARMEABISOFT:#define __SOFTFP__ 1 +// ARMEABISOFT:#define __UINT16_C(c) c // ARMEABISOFT:#define __UINT16_C_SUFFIX__ // ARMEABISOFT:#define __UINT16_MAX__ 65535 // ARMEABISOFT:#define __UINT16_TYPE__ unsigned short +// ARMEABISOFT:#define __UINT32_C(c) c##U // ARMEABISOFT:#define __UINT32_C_SUFFIX__ U // ARMEABISOFT:#define __UINT32_MAX__ 4294967295U // ARMEABISOFT:#define __UINT32_TYPE__ unsigned int +// ARMEABISOFT:#define __UINT64_C(c) c##ULL // ARMEABISOFT:#define __UINT64_C_SUFFIX__ ULL // ARMEABISOFT:#define __UINT64_MAX__ 18446744073709551615ULL // ARMEABISOFT:#define __UINT64_TYPE__ long long unsigned int +// ARMEABISOFT:#define __UINT8_C(c) c // ARMEABISOFT:#define __UINT8_C_SUFFIX__ // ARMEABISOFT:#define __UINT8_MAX__ 255 // ARMEABISOFT:#define __UINT8_TYPE__ unsigned char +// 
ARMEABISOFT:#define __UINTMAX_C(c) c##ULL // ARMEABISOFT:#define __UINTMAX_C_SUFFIX__ ULL // ARMEABISOFT:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARMEABISOFT:#define __UINTMAX_TYPE__ long long unsigned int @@ -640,26 +670,31 @@ // ARMEABISOFTFP_NOFP:#define __FLT_MIN_EXP__ (-125) // ARMEABISOFTFP_NOFP:#define __FLT_MIN__ 1.17549435e-38F // ARMEABISOFTFP_NOFP:#define __FLT_RADIX__ 2 +// ARMEABISOFTFP_NOFP:#define __INT16_C(c) c // ARMEABISOFTFP_NOFP:#define __INT16_C_SUFFIX__ // ARMEABISOFTFP_NOFP:#define __INT16_FMTd__ "hd" // ARMEABISOFTFP_NOFP:#define __INT16_FMTi__ "hi" // ARMEABISOFTFP_NOFP:#define __INT16_MAX__ 32767 // ARMEABISOFTFP_NOFP:#define __INT16_TYPE__ short +// ARMEABISOFTFP_NOFP:#define __INT32_C(c) c // ARMEABISOFTFP_NOFP:#define __INT32_C_SUFFIX__ // ARMEABISOFTFP_NOFP:#define __INT32_FMTd__ "d" // ARMEABISOFTFP_NOFP:#define __INT32_FMTi__ "i" // ARMEABISOFTFP_NOFP:#define __INT32_MAX__ 2147483647 // ARMEABISOFTFP_NOFP:#define __INT32_TYPE__ int +// ARMEABISOFTFP_NOFP:#define __INT64_C(c) c##LL // ARMEABISOFTFP_NOFP:#define __INT64_C_SUFFIX__ LL // ARMEABISOFTFP_NOFP:#define __INT64_FMTd__ "lld" // ARMEABISOFTFP_NOFP:#define __INT64_FMTi__ "lli" // ARMEABISOFTFP_NOFP:#define __INT64_MAX__ 9223372036854775807LL // ARMEABISOFTFP_NOFP:#define __INT64_TYPE__ long long int +// ARMEABISOFTFP_NOFP:#define __INT8_C(c) c // ARMEABISOFTFP_NOFP:#define __INT8_C_SUFFIX__ // ARMEABISOFTFP_NOFP:#define __INT8_FMTd__ "hhd" // ARMEABISOFTFP_NOFP:#define __INT8_FMTi__ "hhi" // ARMEABISOFTFP_NOFP:#define __INT8_MAX__ 127 // ARMEABISOFTFP_NOFP:#define __INT8_TYPE__ signed char +// ARMEABISOFTFP_NOFP:#define __INTMAX_C(c) c##LL // ARMEABISOFTFP_NOFP:#define __INTMAX_C_SUFFIX__ LL // ARMEABISOFTFP_NOFP:#define __INTMAX_FMTd__ "lld" // ARMEABISOFTFP_NOFP:#define __INTMAX_FMTi__ "lli" @@ -745,18 +780,23 @@ // ARMEABISOFTFP_NOFP:#define __SIZE_TYPE__ unsigned int // ARMEABISOFTFP_NOFP:#define __SIZE_WIDTH__ 32 // ARMEABISOFTFP_NOFP:#define __SOFTFP__ 1 +// ARMEABISOFTFP_NOFP:#define __UINT16_C(c) c // ARMEABISOFTFP_NOFP:#define __UINT16_C_SUFFIX__ // ARMEABISOFTFP_NOFP:#define __UINT16_MAX__ 65535 // ARMEABISOFTFP_NOFP:#define __UINT16_TYPE__ unsigned short +// ARMEABISOFTFP_NOFP:#define __UINT32_C(c) c##U // ARMEABISOFTFP_NOFP:#define __UINT32_C_SUFFIX__ U // ARMEABISOFTFP_NOFP:#define __UINT32_MAX__ 4294967295U // ARMEABISOFTFP_NOFP:#define __UINT32_TYPE__ unsigned int +// ARMEABISOFTFP_NOFP:#define __UINT64_C(c) c##ULL // ARMEABISOFTFP_NOFP:#define __UINT64_C_SUFFIX__ ULL // ARMEABISOFTFP_NOFP:#define __UINT64_MAX__ 18446744073709551615ULL // ARMEABISOFTFP_NOFP:#define __UINT64_TYPE__ long long unsigned int +// ARMEABISOFTFP_NOFP:#define __UINT8_C(c) c // ARMEABISOFTFP_NOFP:#define __UINT8_C_SUFFIX__ // ARMEABISOFTFP_NOFP:#define __UINT8_MAX__ 255 // ARMEABISOFTFP_NOFP:#define __UINT8_TYPE__ unsigned char +// ARMEABISOFTFP_NOFP:#define __UINTMAX_C(c) c##ULL // ARMEABISOFTFP_NOFP:#define __UINTMAX_C_SUFFIX__ ULL // ARMEABISOFTFP_NOFP:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARMEABISOFTFP_NOFP:#define __UINTMAX_TYPE__ long long unsigned int @@ -834,26 +874,31 @@ // ARMEABISOFTFP_FP:#define __FLT_MIN_EXP__ (-125) // ARMEABISOFTFP_FP:#define __FLT_MIN__ 1.17549435e-38F // ARMEABISOFTFP_FP:#define __FLT_RADIX__ 2 +// ARMEABISOFTFP_FP:#define __INT16_C(c) c // ARMEABISOFTFP_FP:#define __INT16_C_SUFFIX__ // ARMEABISOFTFP_FP:#define __INT16_FMTd__ "hd" // ARMEABISOFTFP_FP:#define __INT16_FMTi__ "hi" // ARMEABISOFTFP_FP:#define __INT16_MAX__ 32767 // ARMEABISOFTFP_FP:#define 
__INT16_TYPE__ short +// ARMEABISOFTFP_FP:#define __INT32_C(c) c // ARMEABISOFTFP_FP:#define __INT32_C_SUFFIX__ // ARMEABISOFTFP_FP:#define __INT32_FMTd__ "d" // ARMEABISOFTFP_FP:#define __INT32_FMTi__ "i" // ARMEABISOFTFP_FP:#define __INT32_MAX__ 2147483647 // ARMEABISOFTFP_FP:#define __INT32_TYPE__ int +// ARMEABISOFTFP_FP:#define __INT64_C(c) c##LL // ARMEABISOFTFP_FP:#define __INT64_C_SUFFIX__ LL // ARMEABISOFTFP_FP:#define __INT64_FMTd__ "lld" // ARMEABISOFTFP_FP:#define __INT64_FMTi__ "lli" // ARMEABISOFTFP_FP:#define __INT64_MAX__ 9223372036854775807LL // ARMEABISOFTFP_FP:#define __INT64_TYPE__ long long int +// ARMEABISOFTFP_FP:#define __INT8_C(c) c // ARMEABISOFTFP_FP:#define __INT8_C_SUFFIX__ // ARMEABISOFTFP_FP:#define __INT8_FMTd__ "hhd" // ARMEABISOFTFP_FP:#define __INT8_FMTi__ "hhi" // ARMEABISOFTFP_FP:#define __INT8_MAX__ 127 // ARMEABISOFTFP_FP:#define __INT8_TYPE__ signed char +// ARMEABISOFTFP_FP:#define __INTMAX_C(c) c##LL // ARMEABISOFTFP_FP:#define __INTMAX_C_SUFFIX__ LL // ARMEABISOFTFP_FP:#define __INTMAX_FMTd__ "lld" // ARMEABISOFTFP_FP:#define __INTMAX_FMTi__ "lli" @@ -939,18 +984,23 @@ // ARMEABISOFTFP_FP:#define __SIZE_TYPE__ unsigned int // ARMEABISOFTFP_FP:#define __SIZE_WIDTH__ 32 // ARMEABISOFTFP_FP-NOT:#define __SOFTFP__ 1 +// ARMEABISOFTFP_FP:#define __UINT16_C(c) c // ARMEABISOFTFP_FP:#define __UINT16_C_SUFFIX__ // ARMEABISOFTFP_FP:#define __UINT16_MAX__ 65535 // ARMEABISOFTFP_FP:#define __UINT16_TYPE__ unsigned short +// ARMEABISOFTFP_FP:#define __UINT32_C(c) c##U // ARMEABISOFTFP_FP:#define __UINT32_C_SUFFIX__ U // ARMEABISOFTFP_FP:#define __UINT32_MAX__ 4294967295U // ARMEABISOFTFP_FP:#define __UINT32_TYPE__ unsigned int +// ARMEABISOFTFP_FP:#define __UINT64_C(c) c##ULL // ARMEABISOFTFP_FP:#define __UINT64_C_SUFFIX__ ULL // ARMEABISOFTFP_FP:#define __UINT64_MAX__ 18446744073709551615ULL // ARMEABISOFTFP_FP:#define __UINT64_TYPE__ long long unsigned int +// ARMEABISOFTFP_FP:#define __UINT8_C(c) c // ARMEABISOFTFP_FP:#define __UINT8_C_SUFFIX__ // ARMEABISOFTFP_FP:#define __UINT8_MAX__ 255 // ARMEABISOFTFP_FP:#define __UINT8_TYPE__ unsigned char +// ARMEABISOFTFP_FP:#define __UINTMAX_C(c) c##ULL // ARMEABISOFTFP_FP:#define __UINTMAX_C_SUFFIX__ ULL // ARMEABISOFTFP_FP:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARMEABISOFTFP_FP:#define __UINTMAX_TYPE__ long long unsigned int @@ -1028,26 +1078,31 @@ // ARMEABIHARDFP:#define __FLT_MIN_EXP__ (-125) // ARMEABIHARDFP:#define __FLT_MIN__ 1.17549435e-38F // ARMEABIHARDFP:#define __FLT_RADIX__ 2 +// ARMEABIHARDFP:#define __INT16_C(c) c // ARMEABIHARDFP:#define __INT16_C_SUFFIX__ // ARMEABIHARDFP:#define __INT16_FMTd__ "hd" // ARMEABIHARDFP:#define __INT16_FMTi__ "hi" // ARMEABIHARDFP:#define __INT16_MAX__ 32767 // ARMEABIHARDFP:#define __INT16_TYPE__ short +// ARMEABIHARDFP:#define __INT32_C(c) c // ARMEABIHARDFP:#define __INT32_C_SUFFIX__ // ARMEABIHARDFP:#define __INT32_FMTd__ "d" // ARMEABIHARDFP:#define __INT32_FMTi__ "i" // ARMEABIHARDFP:#define __INT32_MAX__ 2147483647 // ARMEABIHARDFP:#define __INT32_TYPE__ int +// ARMEABIHARDFP:#define __INT64_C(c) c##LL // ARMEABIHARDFP:#define __INT64_C_SUFFIX__ LL // ARMEABIHARDFP:#define __INT64_FMTd__ "lld" // ARMEABIHARDFP:#define __INT64_FMTi__ "lli" // ARMEABIHARDFP:#define __INT64_MAX__ 9223372036854775807LL // ARMEABIHARDFP:#define __INT64_TYPE__ long long int +// ARMEABIHARDFP:#define __INT8_C(c) c // ARMEABIHARDFP:#define __INT8_C_SUFFIX__ // ARMEABIHARDFP:#define __INT8_FMTd__ "hhd" // ARMEABIHARDFP:#define __INT8_FMTi__ "hhi" // ARMEABIHARDFP:#define 
__INT8_MAX__ 127 // ARMEABIHARDFP:#define __INT8_TYPE__ signed char +// ARMEABIHARDFP:#define __INTMAX_C(c) c##LL // ARMEABIHARDFP:#define __INTMAX_C_SUFFIX__ LL // ARMEABIHARDFP:#define __INTMAX_FMTd__ "lld" // ARMEABIHARDFP:#define __INTMAX_FMTi__ "lli" @@ -1133,18 +1188,23 @@ // ARMEABIHARDFP:#define __SIZE_TYPE__ unsigned int // ARMEABIHARDFP:#define __SIZE_WIDTH__ 32 // ARMEABIHARDFP-NOT:#define __SOFTFP__ 1 +// ARMEABIHARDFP:#define __UINT16_C(c) c // ARMEABIHARDFP:#define __UINT16_C_SUFFIX__ // ARMEABIHARDFP:#define __UINT16_MAX__ 65535 // ARMEABIHARDFP:#define __UINT16_TYPE__ unsigned short +// ARMEABIHARDFP:#define __UINT32_C(c) c##U // ARMEABIHARDFP:#define __UINT32_C_SUFFIX__ U // ARMEABIHARDFP:#define __UINT32_MAX__ 4294967295U // ARMEABIHARDFP:#define __UINT32_TYPE__ unsigned int +// ARMEABIHARDFP:#define __UINT64_C(c) c##ULL // ARMEABIHARDFP:#define __UINT64_C_SUFFIX__ ULL // ARMEABIHARDFP:#define __UINT64_MAX__ 18446744073709551615ULL // ARMEABIHARDFP:#define __UINT64_TYPE__ long long unsigned int +// ARMEABIHARDFP:#define __UINT8_C(c) c // ARMEABIHARDFP:#define __UINT8_C_SUFFIX__ // ARMEABIHARDFP:#define __UINT8_MAX__ 255 // ARMEABIHARDFP:#define __UINT8_TYPE__ unsigned char +// ARMEABIHARDFP:#define __UINTMAX_C(c) c##ULL // ARMEABIHARDFP:#define __UINTMAX_C_SUFFIX__ ULL // ARMEABIHARDFP:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARMEABIHARDFP:#define __UINTMAX_TYPE__ long long unsigned int @@ -1220,26 +1280,31 @@ // ARM-NETBSD:#define __FLT_MIN_EXP__ (-125) // ARM-NETBSD:#define __FLT_MIN__ 1.17549435e-38F // ARM-NETBSD:#define __FLT_RADIX__ 2 +// ARM-NETBSD:#define __INT16_C(c) c // ARM-NETBSD:#define __INT16_C_SUFFIX__ // ARM-NETBSD:#define __INT16_FMTd__ "hd" // ARM-NETBSD:#define __INT16_FMTi__ "hi" // ARM-NETBSD:#define __INT16_MAX__ 32767 // ARM-NETBSD:#define __INT16_TYPE__ short +// ARM-NETBSD:#define __INT32_C(c) c // ARM-NETBSD:#define __INT32_C_SUFFIX__ // ARM-NETBSD:#define __INT32_FMTd__ "d" // ARM-NETBSD:#define __INT32_FMTi__ "i" // ARM-NETBSD:#define __INT32_MAX__ 2147483647 // ARM-NETBSD:#define __INT32_TYPE__ int +// ARM-NETBSD:#define __INT64_C(c) c##LL // ARM-NETBSD:#define __INT64_C_SUFFIX__ LL // ARM-NETBSD:#define __INT64_FMTd__ "lld" // ARM-NETBSD:#define __INT64_FMTi__ "lli" // ARM-NETBSD:#define __INT64_MAX__ 9223372036854775807LL // ARM-NETBSD:#define __INT64_TYPE__ long long int +// ARM-NETBSD:#define __INT8_C(c) c // ARM-NETBSD:#define __INT8_C_SUFFIX__ // ARM-NETBSD:#define __INT8_FMTd__ "hhd" // ARM-NETBSD:#define __INT8_FMTi__ "hhi" // ARM-NETBSD:#define __INT8_MAX__ 127 // ARM-NETBSD:#define __INT8_TYPE__ signed char +// ARM-NETBSD:#define __INTMAX_C(c) c##LL // ARM-NETBSD:#define __INTMAX_C_SUFFIX__ LL // ARM-NETBSD:#define __INTMAX_FMTd__ "lld" // ARM-NETBSD:#define __INTMAX_FMTi__ "lli" @@ -1325,18 +1390,23 @@ // ARM-NETBSD:#define __SIZE_TYPE__ long unsigned int // ARM-NETBSD:#define __SIZE_WIDTH__ 32 // ARM-NETBSD:#define __SOFTFP__ 1 +// ARM-NETBSD:#define __UINT16_C(c) c // ARM-NETBSD:#define __UINT16_C_SUFFIX__ // ARM-NETBSD:#define __UINT16_MAX__ 65535 // ARM-NETBSD:#define __UINT16_TYPE__ unsigned short +// ARM-NETBSD:#define __UINT32_C(c) c##U // ARM-NETBSD:#define __UINT32_C_SUFFIX__ U // ARM-NETBSD:#define __UINT32_MAX__ 4294967295U // ARM-NETBSD:#define __UINT32_TYPE__ unsigned int +// ARM-NETBSD:#define __UINT64_C(c) c##ULL // ARM-NETBSD:#define __UINT64_C_SUFFIX__ ULL // ARM-NETBSD:#define __UINT64_MAX__ 18446744073709551615ULL // ARM-NETBSD:#define __UINT64_TYPE__ long long unsigned int +// ARM-NETBSD:#define 
__UINT8_C(c) c // ARM-NETBSD:#define __UINT8_C_SUFFIX__ // ARM-NETBSD:#define __UINT8_MAX__ 255 // ARM-NETBSD:#define __UINT8_TYPE__ unsigned char +// ARM-NETBSD:#define __UINTMAX_C(c) c##ULL // ARM-NETBSD:#define __UINTMAX_C_SUFFIX__ ULL // ARM-NETBSD:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARM-NETBSD:#define __UINTMAX_TYPE__ long long unsigned int @@ -1372,6 +1442,7 @@ // RUN: %clang -E -dM -ffreestanding -target arm-netbsd-eabihf %s -o - | FileCheck -match-full-lines -check-prefix ARMHF-NETBSD %s // ARMHF-NETBSD:#define __SIZE_WIDTH__ 32 // ARMHF-NETBSD-NOT:#define __SOFTFP__ 1 +// ARMHF-NETBSD:#define __UINT16_C(c) c // ARMHF-NETBSD:#define __UINT16_C_SUFFIX__ // RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-eabi < /dev/null | FileCheck -match-full-lines -check-prefix ARM-NONE-EABI %s diff --git a/clang/test/Preprocessor/init-csky.c b/clang/test/Preprocessor/init-csky.c index f7868e02644aa..99c5ad1010edb 100644 --- a/clang/test/Preprocessor/init-csky.c +++ b/clang/test/Preprocessor/init-csky.c @@ -66,18 +66,23 @@ // CSKY: #define __GNUC__ {{.*}} // CSKY: #define __GXX_ABI_VERSION {{.*}} // CSKY: #define __ILP32__ 1 +// CSKY: #define __INT16_C(c) c // CSKY: #define __INT16_C_SUFFIX__ // CSKY: #define __INT16_MAX__ 32767 // CSKY: #define __INT16_TYPE__ short +// CSKY: #define __INT32_C(c) c // CSKY: #define __INT32_C_SUFFIX__ // CSKY: #define __INT32_MAX__ 2147483647 // CSKY: #define __INT32_TYPE__ int +// CSKY: #define __INT64_C(c) c##LL // CSKY: #define __INT64_C_SUFFIX__ LL // CSKY: #define __INT64_MAX__ 9223372036854775807LL // CSKY: #define __INT64_TYPE__ long long int +// CSKY: #define __INT8_C(c) c // CSKY: #define __INT8_C_SUFFIX__ // CSKY: #define __INT8_MAX__ 127 // CSKY: #define __INT8_TYPE__ signed char +// CSKY: #define __INTMAX_C(c) c##LL // CSKY: #define __INTMAX_C_SUFFIX__ LL // CSKY: #define __INTMAX_MAX__ 9223372036854775807LL // CSKY: #define __INTMAX_TYPE__ long long int @@ -152,18 +157,23 @@ // CSKY: #define __STDC_UTF_32__ 1 // CSKY: #define __STDC_VERSION__ 201710L // CSKY: #define __STDC__ 1 +// CSKY: #define __UINT16_C(c) c // CSKY: #define __UINT16_C_SUFFIX__ // CSKY: #define __UINT16_MAX__ 65535 // CSKY: #define __UINT16_TYPE__ unsigned short +// CSKY: #define __UINT32_C(c) c##U // CSKY: #define __UINT32_C_SUFFIX__ U // CSKY: #define __UINT32_MAX__ 4294967295U // CSKY: #define __UINT32_TYPE__ unsigned int +// CSKY: #define __UINT64_C(c) c##ULL // CSKY: #define __UINT64_C_SUFFIX__ ULL // CSKY: #define __UINT64_MAX__ 18446744073709551615ULL // CSKY: #define __UINT64_TYPE__ long long unsigned int +// CSKY: #define __UINT8_C(c) c // CSKY: #define __UINT8_C_SUFFIX__ // CSKY: #define __UINT8_MAX__ 255 // CSKY: #define __UINT8_TYPE__ unsigned char +// CSKY: #define __UINTMAX_C(c) c##ULL // CSKY: #define __UINTMAX_C_SUFFIX__ ULL // CSKY: #define __UINTMAX_MAX__ 18446744073709551615ULL // CSKY: #define __UINTMAX_TYPE__ long long unsigned int diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c index f6fd603dc39c0..ac461b371162f 100644 --- a/clang/test/Preprocessor/init-loongarch.c +++ b/clang/test/Preprocessor/init-loongarch.c @@ -85,26 +85,31 @@ // LA32: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 // LA32: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 // LA32: #define __ILP32__ 1 +// LA32: #define __INT16_C(c) c // LA32: #define __INT16_C_SUFFIX__ // LA32: #define __INT16_FMTd__ "hd" // LA32: #define __INT16_FMTi__ "hi" // LA32: #define __INT16_MAX__ 32767 // LA32: #define __INT16_TYPE__ short +// LA32: 
#define __INT32_C(c) c // LA32: #define __INT32_C_SUFFIX__ // LA32: #define __INT32_FMTd__ "d" // LA32: #define __INT32_FMTi__ "i" // LA32: #define __INT32_MAX__ 2147483647 // LA32: #define __INT32_TYPE__ int +// LA32: #define __INT64_C(c) c##LL // LA32: #define __INT64_C_SUFFIX__ LL // LA32: #define __INT64_FMTd__ "lld" // LA32: #define __INT64_FMTi__ "lli" // LA32: #define __INT64_MAX__ 9223372036854775807LL // LA32: #define __INT64_TYPE__ long long int +// LA32: #define __INT8_C(c) c // LA32: #define __INT8_C_SUFFIX__ // LA32: #define __INT8_FMTd__ "hhd" // LA32: #define __INT8_FMTi__ "hhi" // LA32: #define __INT8_MAX__ 127 // LA32: #define __INT8_TYPE__ signed char +// LA32: #define __INTMAX_C(c) c##LL // LA32: #define __INTMAX_C_SUFFIX__ LL // LA32: #define __INTMAX_FMTd__ "lld" // LA32: #define __INTMAX_FMTi__ "lli" @@ -227,6 +232,7 @@ // LA32: #define __STDC_UTF_32__ 1 // LA32: #define __STDC_VERSION__ 201710L // LA32: #define __STDC__ 1 +// LA32: #define __UINT16_C(c) c // LA32: #define __UINT16_C_SUFFIX__ // LA32: #define __UINT16_FMTX__ "hX" // LA32: #define __UINT16_FMTo__ "ho" @@ -234,6 +240,7 @@ // LA32: #define __UINT16_FMTx__ "hx" // LA32: #define __UINT16_MAX__ 65535 // LA32: #define __UINT16_TYPE__ unsigned short +// LA32: #define __UINT32_C(c) c##U // LA32: #define __UINT32_C_SUFFIX__ U // LA32: #define __UINT32_FMTX__ "X" // LA32: #define __UINT32_FMTo__ "o" @@ -241,6 +248,7 @@ // LA32: #define __UINT32_FMTx__ "x" // LA32: #define __UINT32_MAX__ 4294967295U // LA32: #define __UINT32_TYPE__ unsigned int +// LA32: #define __UINT64_C(c) c##ULL // LA32: #define __UINT64_C_SUFFIX__ ULL // LA32: #define __UINT64_FMTX__ "llX" // LA32: #define __UINT64_FMTo__ "llo" @@ -248,6 +256,7 @@ // LA32: #define __UINT64_FMTx__ "llx" // LA32: #define __UINT64_MAX__ 18446744073709551615ULL // LA32: #define __UINT64_TYPE__ long long unsigned int +// LA32: #define __UINT8_C(c) c // LA32: #define __UINT8_C_SUFFIX__ // LA32: #define __UINT8_FMTX__ "hhX" // LA32: #define __UINT8_FMTo__ "hho" @@ -255,6 +264,7 @@ // LA32: #define __UINT8_FMTx__ "hhx" // LA32: #define __UINT8_MAX__ 255 // LA32: #define __UINT8_TYPE__ unsigned char +// LA32: #define __UINTMAX_C(c) c##ULL // LA32: #define __UINTMAX_C_SUFFIX__ ULL // LA32: #define __UINTMAX_FMTX__ "llX" // LA32: #define __UINTMAX_FMTo__ "llo" @@ -406,26 +416,31 @@ // LA64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 // LA64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 // LA64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1 +// LA64: #define __INT16_C(c) c // LA64: #define __INT16_C_SUFFIX__ // LA64: #define __INT16_FMTd__ "hd" // LA64: #define __INT16_FMTi__ "hi" // LA64: #define __INT16_MAX__ 32767 // LA64: #define __INT16_TYPE__ short +// LA64: #define __INT32_C(c) c // LA64: #define __INT32_C_SUFFIX__ // LA64: #define __INT32_FMTd__ "d" // LA64: #define __INT32_FMTi__ "i" // LA64: #define __INT32_MAX__ 2147483647 // LA64: #define __INT32_TYPE__ int +// LA64: #define __INT64_C(c) c##L // LA64: #define __INT64_C_SUFFIX__ L // LA64: #define __INT64_FMTd__ "ld" // LA64: #define __INT64_FMTi__ "li" // LA64: #define __INT64_MAX__ 9223372036854775807L // LA64: #define __INT64_TYPE__ long int +// LA64: #define __INT8_C(c) c // LA64: #define __INT8_C_SUFFIX__ // LA64: #define __INT8_FMTd__ "hhd" // LA64: #define __INT8_FMTi__ "hhi" // LA64: #define __INT8_MAX__ 127 // LA64: #define __INT8_TYPE__ signed char +// LA64: #define __INTMAX_C(c) c##L // LA64: #define __INTMAX_C_SUFFIX__ L // LA64: #define __INTMAX_FMTd__ "ld" // LA64: #define __INTMAX_FMTi__ "li" 
@@ -549,6 +564,7 @@
 // LA64: #define __STDC_UTF_32__ 1
 // LA64: #define __STDC_VERSION__ 201710L
 // LA64: #define __STDC__ 1
+// LA64: #define __UINT16_C(c) c
 // LA64: #define __UINT16_C_SUFFIX__
 // LA64: #define __UINT16_FMTX__ "hX"
 // LA64: #define __UINT16_FMTo__ "ho"
@@ -556,6 +572,7 @@
 // LA64: #define __UINT16_FMTx__ "hx"
 // LA64: #define __UINT16_MAX__ 65535
 // LA64: #define __UINT16_TYPE__ unsigned short
+// LA64: #define __UINT32_C(c) c##U
 // LA64: #define __UINT32_C_SUFFIX__ U
 // LA64: #define __UINT32_FMTX__ "X"
 // LA64: #define __UINT32_FMTo__ "o"
@@ -563,6 +580,7 @@
 // LA64: #define __UINT32_FMTx__ "x"
 // LA64: #define __UINT32_MAX__ 4294967295U
 // LA64: #define __UINT32_TYPE__ unsigned int
+// LA64: #define __UINT64_C(c) c##UL
 // LA64: #define __UINT64_C_SUFFIX__ UL
 // LA64: #define __UINT64_FMTX__ "lX"
 // LA64: #define __UINT64_FMTo__ "lo"
@@ -570,6 +588,7 @@
 // LA64: #define __UINT64_FMTx__ "lx"
 // LA64: #define __UINT64_MAX__ 18446744073709551615UL
 // LA64: #define __UINT64_TYPE__ long unsigned int
+// LA64: #define __UINT8_C(c) c
 // LA64: #define __UINT8_C_SUFFIX__
 // LA64: #define __UINT8_FMTX__ "hhX"
 // LA64: #define __UINT8_FMTo__ "hho"
@@ -577,6 +596,7 @@
 // LA64: #define __UINT8_FMTx__ "hhx"
 // LA64: #define __UINT8_MAX__ 255
 // LA64: #define __UINT8_TYPE__ unsigned char
+// LA64: #define __UINTMAX_C(c) c##UL
 // LA64: #define __UINTMAX_C_SUFFIX__ UL
 // LA64: #define __UINTMAX_FMTX__ "lX"
 // LA64: #define __UINTMAX_FMTo__ "lo"
diff --git a/clang/test/Preprocessor/init-mips.c b/clang/test/Preprocessor/init-mips.c
index 34091ea3690da..4fead33bd826e 100644
--- a/clang/test/Preprocessor/init-mips.c
+++ b/clang/test/Preprocessor/init-mips.c
@@ -49,26 +49,31 @@
 // MIPS32BE:#define __FLT_MIN_EXP__ (-125)
 // MIPS32BE:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS32BE:#define __FLT_RADIX__ 2
+// MIPS32BE:#define __INT16_C(c) c
 // MIPS32BE:#define __INT16_C_SUFFIX__
 // MIPS32BE:#define __INT16_FMTd__ "hd"
 // MIPS32BE:#define __INT16_FMTi__ "hi"
 // MIPS32BE:#define __INT16_MAX__ 32767
 // MIPS32BE:#define __INT16_TYPE__ short
+// MIPS32BE:#define __INT32_C(c) c
 // MIPS32BE:#define __INT32_C_SUFFIX__
 // MIPS32BE:#define __INT32_FMTd__ "d"
 // MIPS32BE:#define __INT32_FMTi__ "i"
 // MIPS32BE:#define __INT32_MAX__ 2147483647
 // MIPS32BE:#define __INT32_TYPE__ int
+// MIPS32BE:#define __INT64_C(c) c##LL
 // MIPS32BE:#define __INT64_C_SUFFIX__ LL
 // MIPS32BE:#define __INT64_FMTd__ "lld"
 // MIPS32BE:#define __INT64_FMTi__ "lli"
 // MIPS32BE:#define __INT64_MAX__ 9223372036854775807LL
 // MIPS32BE:#define __INT64_TYPE__ long long int
+// MIPS32BE:#define __INT8_C(c) c
 // MIPS32BE:#define __INT8_C_SUFFIX__
 // MIPS32BE:#define __INT8_FMTd__ "hhd"
 // MIPS32BE:#define __INT8_FMTi__ "hhi"
 // MIPS32BE:#define __INT8_MAX__ 127
 // MIPS32BE:#define __INT8_TYPE__ signed char
+// MIPS32BE:#define __INTMAX_C(c) c##LL
 // MIPS32BE:#define __INTMAX_C_SUFFIX__ LL
 // MIPS32BE:#define __INTMAX_FMTd__ "lld"
 // MIPS32BE:#define __INTMAX_FMTi__ "lli"
@@ -159,18 +164,23 @@
 // MIPS32BE:#define __STDC_HOSTED__ 0
 // MIPS32BE-C:#define __STDC_VERSION__ 201710L
 // MIPS32BE:#define __STDC__ 1
+// MIPS32BE:#define __UINT16_C(c) c
 // MIPS32BE:#define __UINT16_C_SUFFIX__
 // MIPS32BE:#define __UINT16_MAX__ 65535
 // MIPS32BE:#define __UINT16_TYPE__ unsigned short
+// MIPS32BE:#define __UINT32_C(c) c##U
 // MIPS32BE:#define __UINT32_C_SUFFIX__ U
 // MIPS32BE:#define __UINT32_MAX__ 4294967295U
 // MIPS32BE:#define __UINT32_TYPE__ unsigned int
+// MIPS32BE:#define __UINT64_C(c) c##ULL
 // MIPS32BE:#define __UINT64_C_SUFFIX__ ULL
 // MIPS32BE:#define __UINT64_MAX__ 18446744073709551615ULL
 // MIPS32BE:#define __UINT64_TYPE__ long long unsigned int
+// MIPS32BE:#define __UINT8_C(c) c
 // MIPS32BE:#define __UINT8_C_SUFFIX__
 // MIPS32BE:#define __UINT8_MAX__ 255
 // MIPS32BE:#define __UINT8_TYPE__ unsigned char
+// MIPS32BE:#define __UINTMAX_C(c) c##ULL
 // MIPS32BE:#define __UINTMAX_C_SUFFIX__ ULL
 // MIPS32BE:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPS32BE:#define __UINTMAX_TYPE__ long long unsigned int
@@ -259,26 +269,31 @@
 // MIPS32EL:#define __FLT_MIN_EXP__ (-125)
 // MIPS32EL:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS32EL:#define __FLT_RADIX__ 2
+// MIPS32EL:#define __INT16_C(c) c
 // MIPS32EL:#define __INT16_C_SUFFIX__
 // MIPS32EL:#define __INT16_FMTd__ "hd"
 // MIPS32EL:#define __INT16_FMTi__ "hi"
 // MIPS32EL:#define __INT16_MAX__ 32767
 // MIPS32EL:#define __INT16_TYPE__ short
+// MIPS32EL:#define __INT32_C(c) c
 // MIPS32EL:#define __INT32_C_SUFFIX__
 // MIPS32EL:#define __INT32_FMTd__ "d"
 // MIPS32EL:#define __INT32_FMTi__ "i"
 // MIPS32EL:#define __INT32_MAX__ 2147483647
 // MIPS32EL:#define __INT32_TYPE__ int
+// MIPS32EL:#define __INT64_C(c) c##LL
 // MIPS32EL:#define __INT64_C_SUFFIX__ LL
 // MIPS32EL:#define __INT64_FMTd__ "lld"
 // MIPS32EL:#define __INT64_FMTi__ "lli"
 // MIPS32EL:#define __INT64_MAX__ 9223372036854775807LL
 // MIPS32EL:#define __INT64_TYPE__ long long int
+// MIPS32EL:#define __INT8_C(c) c
 // MIPS32EL:#define __INT8_C_SUFFIX__
 // MIPS32EL:#define __INT8_FMTd__ "hhd"
 // MIPS32EL:#define __INT8_FMTi__ "hhi"
 // MIPS32EL:#define __INT8_MAX__ 127
 // MIPS32EL:#define __INT8_TYPE__ signed char
+// MIPS32EL:#define __INTMAX_C(c) c##LL
 // MIPS32EL:#define __INTMAX_C_SUFFIX__ LL
 // MIPS32EL:#define __INTMAX_FMTd__ "lld"
 // MIPS32EL:#define __INTMAX_FMTi__ "lli"
@@ -366,18 +381,23 @@
 // MIPS32EL:#define __SIZE_MAX__ 4294967295U
 // MIPS32EL:#define __SIZE_TYPE__ unsigned int
 // MIPS32EL:#define __SIZE_WIDTH__ 32
+// MIPS32EL:#define __UINT16_C(c) c
 // MIPS32EL:#define __UINT16_C_SUFFIX__
 // MIPS32EL:#define __UINT16_MAX__ 65535
 // MIPS32EL:#define __UINT16_TYPE__ unsigned short
+// MIPS32EL:#define __UINT32_C(c) c##U
 // MIPS32EL:#define __UINT32_C_SUFFIX__ U
 // MIPS32EL:#define __UINT32_MAX__ 4294967295U
 // MIPS32EL:#define __UINT32_TYPE__ unsigned int
+// MIPS32EL:#define __UINT64_C(c) c##ULL
 // MIPS32EL:#define __UINT64_C_SUFFIX__ ULL
 // MIPS32EL:#define __UINT64_MAX__ 18446744073709551615ULL
 // MIPS32EL:#define __UINT64_TYPE__ long long unsigned int
+// MIPS32EL:#define __UINT8_C(c) c
 // MIPS32EL:#define __UINT8_C_SUFFIX__
 // MIPS32EL:#define __UINT8_MAX__ 255
 // MIPS32EL:#define __UINT8_TYPE__ unsigned char
+// MIPS32EL:#define __UINTMAX_C(c) c##ULL
 // MIPS32EL:#define __UINTMAX_C_SUFFIX__ ULL
 // MIPS32EL:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPS32EL:#define __UINTMAX_TYPE__ long long unsigned int
@@ -496,26 +516,31 @@
 // MIPSN32BE: #define __GNUC__ 4
 // MIPSN32BE: #define __GXX_ABI_VERSION 1002
 // MIPSN32BE: #define __ILP32__ 1
+// MIPSN32BE: #define __INT16_C(c) c
 // MIPSN32BE: #define __INT16_C_SUFFIX__
 // MIPSN32BE: #define __INT16_FMTd__ "hd"
 // MIPSN32BE: #define __INT16_FMTi__ "hi"
 // MIPSN32BE: #define __INT16_MAX__ 32767
 // MIPSN32BE: #define __INT16_TYPE__ short
+// MIPSN32BE: #define __INT32_C(c) c
 // MIPSN32BE: #define __INT32_C_SUFFIX__
 // MIPSN32BE: #define __INT32_FMTd__ "d"
 // MIPSN32BE: #define __INT32_FMTi__ "i"
 // MIPSN32BE: #define __INT32_MAX__ 2147483647
 // MIPSN32BE: #define __INT32_TYPE__ int
+// MIPSN32BE: #define __INT64_C(c) c##LL
 // MIPSN32BE: #define __INT64_C_SUFFIX__ LL
 // MIPSN32BE: #define __INT64_FMTd__ "lld"
 // MIPSN32BE: #define __INT64_FMTi__ "lli"
 // MIPSN32BE: #define __INT64_MAX__ 9223372036854775807LL
 // MIPSN32BE: #define __INT64_TYPE__ long long int
+// MIPSN32BE: #define __INT8_C(c) c
 // MIPSN32BE: #define __INT8_C_SUFFIX__
 // MIPSN32BE: #define __INT8_FMTd__ "hhd"
 // MIPSN32BE: #define __INT8_FMTi__ "hhi"
 // MIPSN32BE: #define __INT8_MAX__ 127
 // MIPSN32BE: #define __INT8_TYPE__ signed char
+// MIPSN32BE: #define __INTMAX_C(c) c##LL
 // MIPSN32BE: #define __INTMAX_C_SUFFIX__ LL
 // MIPSN32BE: #define __INTMAX_FMTd__ "lld"
 // MIPSN32BE: #define __INTMAX_FMTi__ "lli"
@@ -618,6 +643,7 @@
 // MIPSN32BE: #define __STDC_UTF_32__ 1
 // MIPSN32BE-C: #define __STDC_VERSION__ 201710L
 // MIPSN32BE: #define __STDC__ 1
+// MIPSN32BE: #define __UINT16_C(c) c
 // MIPSN32BE: #define __UINT16_C_SUFFIX__
 // MIPSN32BE: #define __UINT16_FMTX__ "hX"
 // MIPSN32BE: #define __UINT16_FMTo__ "ho"
@@ -625,6 +651,7 @@
 // MIPSN32BE: #define __UINT16_FMTx__ "hx"
 // MIPSN32BE: #define __UINT16_MAX__ 65535
 // MIPSN32BE: #define __UINT16_TYPE__ unsigned short
+// MIPSN32BE: #define __UINT32_C(c) c##U
 // MIPSN32BE: #define __UINT32_C_SUFFIX__ U
 // MIPSN32BE: #define __UINT32_FMTX__ "X"
 // MIPSN32BE: #define __UINT32_FMTo__ "o"
@@ -632,6 +659,7 @@
 // MIPSN32BE: #define __UINT32_FMTx__ "x"
 // MIPSN32BE: #define __UINT32_MAX__ 4294967295U
 // MIPSN32BE: #define __UINT32_TYPE__ unsigned int
+// MIPSN32BE: #define __UINT64_C(c) c##ULL
 // MIPSN32BE: #define __UINT64_C_SUFFIX__ ULL
 // MIPSN32BE: #define __UINT64_FMTX__ "llX"
 // MIPSN32BE: #define __UINT64_FMTo__ "llo"
@@ -639,6 +667,7 @@
 // MIPSN32BE: #define __UINT64_FMTx__ "llx"
 // MIPSN32BE: #define __UINT64_MAX__ 18446744073709551615ULL
 // MIPSN32BE: #define __UINT64_TYPE__ long long unsigned int
+// MIPSN32BE: #define __UINT8_C(c) c
 // MIPSN32BE: #define __UINT8_C_SUFFIX__
 // MIPSN32BE: #define __UINT8_FMTX__ "hhX"
 // MIPSN32BE: #define __UINT8_FMTo__ "hho"
@@ -646,6 +675,7 @@
 // MIPSN32BE: #define __UINT8_FMTx__ "hhx"
 // MIPSN32BE: #define __UINT8_MAX__ 255
 // MIPSN32BE: #define __UINT8_TYPE__ unsigned char
+// MIPSN32BE: #define __UINTMAX_C(c) c##ULL
 // MIPSN32BE: #define __UINTMAX_C_SUFFIX__ ULL
 // MIPSN32BE: #define __UINTMAX_FMTX__ "llX"
 // MIPSN32BE: #define __UINTMAX_FMTo__ "llo"
@@ -803,26 +833,31 @@
 // MIPSN32EL: #define __GNUC__ 4
 // MIPSN32EL: #define __GXX_ABI_VERSION 1002
 // MIPSN32EL: #define __ILP32__ 1
+// MIPSN32EL: #define __INT16_C(c) c
 // MIPSN32EL: #define __INT16_C_SUFFIX__
 // MIPSN32EL: #define __INT16_FMTd__ "hd"
 // MIPSN32EL: #define __INT16_FMTi__ "hi"
 // MIPSN32EL: #define __INT16_MAX__ 32767
 // MIPSN32EL: #define __INT16_TYPE__ short
+// MIPSN32EL: #define __INT32_C(c) c
 // MIPSN32EL: #define __INT32_C_SUFFIX__
 // MIPSN32EL: #define __INT32_FMTd__ "d"
 // MIPSN32EL: #define __INT32_FMTi__ "i"
 // MIPSN32EL: #define __INT32_MAX__ 2147483647
 // MIPSN32EL: #define __INT32_TYPE__ int
+// MIPSN32EL: #define __INT64_C(c) c##LL
 // MIPSN32EL: #define __INT64_C_SUFFIX__ LL
 // MIPSN32EL: #define __INT64_FMTd__ "lld"
 // MIPSN32EL: #define __INT64_FMTi__ "lli"
 // MIPSN32EL: #define __INT64_MAX__ 9223372036854775807LL
 // MIPSN32EL: #define __INT64_TYPE__ long long int
+// MIPSN32EL: #define __INT8_C(c) c
 // MIPSN32EL: #define __INT8_C_SUFFIX__
 // MIPSN32EL: #define __INT8_FMTd__ "hhd"
 // MIPSN32EL: #define __INT8_FMTi__ "hhi"
 // MIPSN32EL: #define __INT8_MAX__ 127
 // MIPSN32EL: #define __INT8_TYPE__ signed char
+// MIPSN32EL: #define __INTMAX_C(c) c##LL
 // MIPSN32EL: #define __INTMAX_C_SUFFIX__ LL
 // MIPSN32EL: #define __INTMAX_FMTd__ "lld"
 // MIPSN32EL: #define __INTMAX_FMTi__ "lli"
@@ -925,6 +960,7 @@
 // MIPSN32EL: #define __STDC_UTF_32__ 1
 // MIPSN32EL: #define __STDC_VERSION__ 201710L
 // MIPSN32EL: #define __STDC__ 1
+// MIPSN32EL: #define __UINT16_C(c) c
 // MIPSN32EL: #define __UINT16_C_SUFFIX__
 // MIPSN32EL: #define __UINT16_FMTX__ "hX"
 // MIPSN32EL: #define __UINT16_FMTo__ "ho"
@@ -932,6 +968,7 @@
 // MIPSN32EL: #define __UINT16_FMTx__ "hx"
 // MIPSN32EL: #define __UINT16_MAX__ 65535
 // MIPSN32EL: #define __UINT16_TYPE__ unsigned short
+// MIPSN32EL: #define __UINT32_C(c) c##U
 // MIPSN32EL: #define __UINT32_C_SUFFIX__ U
 // MIPSN32EL: #define __UINT32_FMTX__ "X"
 // MIPSN32EL: #define __UINT32_FMTo__ "o"
@@ -939,6 +976,7 @@
 // MIPSN32EL: #define __UINT32_FMTx__ "x"
 // MIPSN32EL: #define __UINT32_MAX__ 4294967295U
 // MIPSN32EL: #define __UINT32_TYPE__ unsigned int
+// MIPSN32EL: #define __UINT64_C(c) c##ULL
 // MIPSN32EL: #define __UINT64_C_SUFFIX__ ULL
 // MIPSN32EL: #define __UINT64_FMTX__ "llX"
 // MIPSN32EL: #define __UINT64_FMTo__ "llo"
@@ -946,6 +984,7 @@
 // MIPSN32EL: #define __UINT64_FMTx__ "llx"
 // MIPSN32EL: #define __UINT64_MAX__ 18446744073709551615ULL
 // MIPSN32EL: #define __UINT64_TYPE__ long long unsigned int
+// MIPSN32EL: #define __UINT8_C(c) c
 // MIPSN32EL: #define __UINT8_C_SUFFIX__
 // MIPSN32EL: #define __UINT8_FMTX__ "hhX"
 // MIPSN32EL: #define __UINT8_FMTo__ "hho"
@@ -953,6 +992,7 @@
 // MIPSN32EL: #define __UINT8_FMTx__ "hhx"
 // MIPSN32EL: #define __UINT8_MAX__ 255
 // MIPSN32EL: #define __UINT8_TYPE__ unsigned char
+// MIPSN32EL: #define __UINTMAX_C(c) c##ULL
 // MIPSN32EL: #define __UINTMAX_C_SUFFIX__ ULL
 // MIPSN32EL: #define __UINTMAX_FMTX__ "llX"
 // MIPSN32EL: #define __UINTMAX_FMTo__ "llo"
@@ -1086,26 +1126,31 @@
 // MIPS64BE:#define __FLT_MIN_EXP__ (-125)
 // MIPS64BE:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS64BE:#define __FLT_RADIX__ 2
+// MIPS64BE:#define __INT16_C(c) c
 // MIPS64BE:#define __INT16_C_SUFFIX__
 // MIPS64BE:#define __INT16_FMTd__ "hd"
 // MIPS64BE:#define __INT16_FMTi__ "hi"
 // MIPS64BE:#define __INT16_MAX__ 32767
 // MIPS64BE:#define __INT16_TYPE__ short
+// MIPS64BE:#define __INT32_C(c) c
 // MIPS64BE:#define __INT32_C_SUFFIX__
 // MIPS64BE:#define __INT32_FMTd__ "d"
 // MIPS64BE:#define __INT32_FMTi__ "i"
 // MIPS64BE:#define __INT32_MAX__ 2147483647
 // MIPS64BE:#define __INT32_TYPE__ int
+// MIPS64BE:#define __INT64_C(c) c##L
 // MIPS64BE:#define __INT64_C_SUFFIX__ L
 // MIPS64BE:#define __INT64_FMTd__ "ld"
 // MIPS64BE:#define __INT64_FMTi__ "li"
 // MIPS64BE:#define __INT64_MAX__ 9223372036854775807L
 // MIPS64BE:#define __INT64_TYPE__ long int
+// MIPS64BE:#define __INT8_C(c) c
 // MIPS64BE:#define __INT8_C_SUFFIX__
 // MIPS64BE:#define __INT8_FMTd__ "hhd"
 // MIPS64BE:#define __INT8_FMTi__ "hhi"
 // MIPS64BE:#define __INT8_MAX__ 127
 // MIPS64BE:#define __INT8_TYPE__ signed char
+// MIPS64BE:#define __INTMAX_C(c) c##L
 // MIPS64BE:#define __INTMAX_C_SUFFIX__ L
 // MIPS64BE:#define __INTMAX_FMTd__ "ld"
 // MIPS64BE:#define __INTMAX_FMTi__ "li"
@@ -1194,18 +1239,23 @@
 // MIPS64BE:#define __SIZE_TYPE__ long unsigned int
 // MIPS64BE:#define __SIZE_WIDTH__ 64
 // MIPS64BE-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
+// MIPS64BE:#define __UINT16_C(c) c
 // MIPS64BE:#define __UINT16_C_SUFFIX__
 // MIPS64BE:#define __UINT16_MAX__ 65535
 // MIPS64BE:#define __UINT16_TYPE__ unsigned short
+// MIPS64BE:#define __UINT32_C(c) c##U
 // MIPS64BE:#define __UINT32_C_SUFFIX__ U
 // MIPS64BE:#define __UINT32_MAX__ 4294967295U
 // MIPS64BE:#define __UINT32_TYPE__ unsigned int
+// MIPS64BE:#define __UINT64_C(c) c##UL
 // MIPS64BE:#define __UINT64_C_SUFFIX__ UL
 // MIPS64BE:#define __UINT64_MAX__ 18446744073709551615UL
 // MIPS64BE:#define __UINT64_TYPE__ long unsigned int
+// MIPS64BE:#define __UINT8_C(c) c
 // MIPS64BE:#define __UINT8_C_SUFFIX__
 // MIPS64BE:#define __UINT8_MAX__ 255
 // MIPS64BE:#define __UINT8_TYPE__ unsigned char
+// MIPS64BE:#define __UINTMAX_C(c) c##UL
 // MIPS64BE:#define __UINTMAX_C_SUFFIX__ UL
 // MIPS64BE:#define __UINTMAX_MAX__ 18446744073709551615UL
 // MIPS64BE:#define __UINTMAX_TYPE__ long unsigned int
@@ -1296,26 +1346,31 @@
 // MIPS64EL:#define __FLT_MIN_EXP__ (-125)
 // MIPS64EL:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS64EL:#define __FLT_RADIX__ 2
+// MIPS64EL:#define __INT16_C(c) c
 // MIPS64EL:#define __INT16_C_SUFFIX__
 // MIPS64EL:#define __INT16_FMTd__ "hd"
 // MIPS64EL:#define __INT16_FMTi__ "hi"
 // MIPS64EL:#define __INT16_MAX__ 32767
 // MIPS64EL:#define __INT16_TYPE__ short
+// MIPS64EL:#define __INT32_C(c) c
 // MIPS64EL:#define __INT32_C_SUFFIX__
 // MIPS64EL:#define __INT32_FMTd__ "d"
 // MIPS64EL:#define __INT32_FMTi__ "i"
 // MIPS64EL:#define __INT32_MAX__ 2147483647
 // MIPS64EL:#define __INT32_TYPE__ int
+// MIPS64EL:#define __INT64_C(c) c##L
 // MIPS64EL:#define __INT64_C_SUFFIX__ L
 // MIPS64EL:#define __INT64_FMTd__ "ld"
 // MIPS64EL:#define __INT64_FMTi__ "li"
 // MIPS64EL:#define __INT64_MAX__ 9223372036854775807L
 // MIPS64EL:#define __INT64_TYPE__ long int
+// MIPS64EL:#define __INT8_C(c) c
 // MIPS64EL:#define __INT8_C_SUFFIX__
 // MIPS64EL:#define __INT8_FMTd__ "hhd"
 // MIPS64EL:#define __INT8_FMTi__ "hhi"
 // MIPS64EL:#define __INT8_MAX__ 127
 // MIPS64EL:#define __INT8_TYPE__ signed char
+// MIPS64EL:#define __INTMAX_C(c) c##L
 // MIPS64EL:#define __INTMAX_C_SUFFIX__ L
 // MIPS64EL:#define __INTMAX_FMTd__ "ld"
 // MIPS64EL:#define __INTMAX_FMTi__ "li"
@@ -1404,18 +1459,23 @@
 // MIPS64EL:#define __SIZE_MAX__ 18446744073709551615UL
 // MIPS64EL:#define __SIZE_TYPE__ long unsigned int
 // MIPS64EL:#define __SIZE_WIDTH__ 64
+// MIPS64EL:#define __UINT16_C(c) c
 // MIPS64EL:#define __UINT16_C_SUFFIX__
 // MIPS64EL:#define __UINT16_MAX__ 65535
 // MIPS64EL:#define __UINT16_TYPE__ unsigned short
+// MIPS64EL:#define __UINT32_C(c) c##U
 // MIPS64EL:#define __UINT32_C_SUFFIX__ U
 // MIPS64EL:#define __UINT32_MAX__ 4294967295U
 // MIPS64EL:#define __UINT32_TYPE__ unsigned int
+// MIPS64EL:#define __UINT64_C(c) c##UL
 // MIPS64EL:#define __UINT64_C_SUFFIX__ UL
 // MIPS64EL:#define __UINT64_MAX__ 18446744073709551615UL
 // MIPS64EL:#define __UINT64_TYPE__ long unsigned int
+// MIPS64EL:#define __UINT8_C(c) c
 // MIPS64EL:#define __UINT8_C_SUFFIX__
 // MIPS64EL:#define __UINT8_MAX__ 255
 // MIPS64EL:#define __UINT8_TYPE__ unsigned char
+// MIPS64EL:#define __UINTMAX_C(c) c##UL
 // MIPS64EL:#define __UINTMAX_C_SUFFIX__ UL
 // MIPS64EL:#define __UINTMAX_MAX__ 18446744073709551615UL
 // MIPS64EL:#define __UINTMAX_TYPE__ long unsigned int
diff --git a/clang/test/Preprocessor/init-ppc.c b/clang/test/Preprocessor/init-ppc.c
index 1421b102a3dfd..6b7eceda9b97b 100644
--- a/clang/test/Preprocessor/init-ppc.c
+++ b/clang/test/Preprocessor/init-ppc.c
@@ -41,26 +41,31 @@
 // PPC603E:#define __FLT_MIN_EXP__ (-125)
 // PPC603E:#define __FLT_MIN__ 1.17549435e-38F
 // PPC603E:#define __FLT_RADIX__ 2
+// PPC603E:#define __INT16_C(c) c
 // PPC603E:#define __INT16_C_SUFFIX__
 // PPC603E:#define __INT16_FMTd__ "hd"
 // PPC603E:#define __INT16_FMTi__ "hi"
 // PPC603E:#define __INT16_MAX__ 32767
 // PPC603E:#define __INT16_TYPE__ short
+// PPC603E:#define __INT32_C(c) c
 // PPC603E:#define __INT32_C_SUFFIX__
 // PPC603E:#define __INT32_FMTd__ "d"
 // PPC603E:#define __INT32_FMTi__ "i"
 // PPC603E:#define __INT32_MAX__ 2147483647
 // PPC603E:#define __INT32_TYPE__ int
+// PPC603E:#define __INT64_C(c) c##LL
 // PPC603E:#define __INT64_C_SUFFIX__ LL
 // PPC603E:#define __INT64_FMTd__ "lld"
 // PPC603E:#define __INT64_FMTi__ "lli"
 // PPC603E:#define __INT64_MAX__ 9223372036854775807LL
 // PPC603E:#define __INT64_TYPE__ long long int
+// PPC603E:#define __INT8_C(c) c
 // PPC603E:#define __INT8_C_SUFFIX__
 // PPC603E:#define __INT8_FMTd__ "hhd"
 // PPC603E:#define __INT8_FMTi__ "hhi"
 // PPC603E:#define __INT8_MAX__ 127
 // PPC603E:#define __INT8_TYPE__ signed char
+// PPC603E:#define __INTMAX_C(c) c##LL
 // PPC603E:#define __INTMAX_C_SUFFIX__ LL
 // PPC603E:#define __INTMAX_FMTd__ "lld"
 // PPC603E:#define __INTMAX_FMTi__ "lli"
@@ -150,18 +155,23 @@
 // PPC603E:#define __SIZE_TYPE__ long unsigned int
 // PPC603E:#define __SIZE_WIDTH__ 32
 // PPC603E-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
+// PPC603E:#define __UINT16_C(c) c
 // PPC603E:#define __UINT16_C_SUFFIX__
 // PPC603E:#define __UINT16_MAX__ 65535
 // PPC603E:#define __UINT16_TYPE__ unsigned short
+// PPC603E:#define __UINT32_C(c) c##U
 // PPC603E:#define __UINT32_C_SUFFIX__ U
 // PPC603E:#define __UINT32_MAX__ 4294967295U
 // PPC603E:#define __UINT32_TYPE__ unsigned int
+// PPC603E:#define __UINT64_C(c) c##ULL
 // PPC603E:#define __UINT64_C_SUFFIX__ ULL
 // PPC603E:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC603E:#define __UINT64_TYPE__ long long unsigned int
+// PPC603E:#define __UINT8_C(c) c
 // PPC603E:#define __UINT8_C_SUFFIX__
 // PPC603E:#define __UINT8_MAX__ 255
 // PPC603E:#define __UINT8_TYPE__ unsigned char
+// PPC603E:#define __UINTMAX_C(c) c##ULL
 // PPC603E:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC603E:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC603E:#define __UINTMAX_TYPE__ long long unsigned int
@@ -235,26 +245,31 @@
 // PPC:#define __FLT_MIN__ 1.17549435e-38F
 // PPC:#define __FLT_RADIX__ 2
 // PPC:#define __HAVE_BSWAP__ 1
+// PPC:#define __INT16_C(c) c
 // PPC:#define __INT16_C_SUFFIX__
 // PPC:#define __INT16_FMTd__ "hd"
 // PPC:#define __INT16_FMTi__ "hi"
 // PPC:#define __INT16_MAX__ 32767
 // PPC:#define __INT16_TYPE__ short
+// PPC:#define __INT32_C(c) c
 // PPC:#define __INT32_C_SUFFIX__
 // PPC:#define __INT32_FMTd__ "d"
 // PPC:#define __INT32_FMTi__ "i"
 // PPC:#define __INT32_MAX__ 2147483647
 // PPC:#define __INT32_TYPE__ int
+// PPC:#define __INT64_C(c) c##LL
 // PPC:#define __INT64_C_SUFFIX__ LL
 // PPC:#define __INT64_FMTd__ "lld"
 // PPC:#define __INT64_FMTi__ "lli"
 // PPC:#define __INT64_MAX__ 9223372036854775807LL
 // PPC:#define __INT64_TYPE__ long long int
+// PPC:#define __INT8_C(c) c
 // PPC:#define __INT8_C_SUFFIX__
 // PPC:#define __INT8_FMTd__ "hhd"
 // PPC:#define __INT8_FMTi__ "hhi"
 // PPC:#define __INT8_MAX__ 127
 // PPC:#define __INT8_TYPE__ signed char
+// PPC:#define __INTMAX_C(c) c##LL
 // PPC:#define __INTMAX_C_SUFFIX__ LL
 // PPC:#define __INTMAX_FMTd__ "lld"
 // PPC:#define __INTMAX_FMTi__ "lli"
@@ -344,18 +359,23 @@
 // PPC:#define __SIZE_MAX__ 4294967295UL
 // PPC:#define __SIZE_TYPE__ long unsigned int
 // PPC:#define __SIZE_WIDTH__ 32
+// PPC:#define __UINT16_C(c) c
 // PPC:#define __UINT16_C_SUFFIX__
 // PPC:#define __UINT16_MAX__ 65535
 // PPC:#define __UINT16_TYPE__ unsigned short
+// PPC:#define __UINT32_C(c) c##U
 // PPC:#define __UINT32_C_SUFFIX__ U
 // PPC:#define __UINT32_MAX__ 4294967295U
 // PPC:#define __UINT32_TYPE__ unsigned int
+// PPC:#define __UINT64_C(c) c##ULL
 // PPC:#define __UINT64_C_SUFFIX__ ULL
 // PPC:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC:#define __UINT64_TYPE__ long long unsigned int
+// PPC:#define __UINT8_C(c) c
 // PPC:#define __UINT8_C_SUFFIX__
 // PPC:#define __UINT8_MAX__ 255
 // PPC:#define __UINT8_TYPE__ unsigned char
+// PPC:#define __UINTMAX_C(c) c##ULL
 // PPC:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC:#define __UINTMAX_TYPE__ long long unsigned int
@@ -435,26 +455,31 @@
 // PPC-AIX:#define __FLT_MIN__ 1.17549435e-38F
 // PPC-AIX:#define __FLT_RADIX__ 2
 // PPC-AIX:#define __HOS_AIX__ 1
+// PPC-AIX:#define __INT16_C(c) c
 // PPC-AIX:#define __INT16_C_SUFFIX__
 // PPC-AIX:#define __INT16_FMTd__ "hd"
 // PPC-AIX:#define __INT16_FMTi__ "hi"
 // PPC-AIX:#define __INT16_MAX__ 32767
 // PPC-AIX:#define __INT16_TYPE__ short
+// PPC-AIX:#define __INT32_C(c) c
 // PPC-AIX:#define __INT32_C_SUFFIX__
 // PPC-AIX:#define __INT32_FMTd__ "d"
 // PPC-AIX:#define __INT32_FMTi__ "i"
 // PPC-AIX:#define __INT32_MAX__ 2147483647
 // PPC-AIX:#define __INT32_TYPE__ int
+// PPC-AIX:#define __INT64_C(c) c##LL
 // PPC-AIX:#define __INT64_C_SUFFIX__ LL
 // PPC-AIX:#define __INT64_FMTd__ "lld"
 // PPC-AIX:#define __INT64_FMTi__ "lli"
 // PPC-AIX:#define __INT64_MAX__ 9223372036854775807LL
 // PPC-AIX:#define __INT64_TYPE__ long long int
+// PPC-AIX:#define __INT8_C(c) c
 // PPC-AIX:#define __INT8_C_SUFFIX__
 // PPC-AIX:#define __INT8_FMTd__ "hhd"
 // PPC-AIX:#define __INT8_FMTi__ "hhi"
 // PPC-AIX:#define __INT8_MAX__ 127
 // PPC-AIX:#define __INT8_TYPE__ signed char
+// PPC-AIX:#define __INTMAX_C(c) c##LL
 // PPC-AIX:#define __INTMAX_C_SUFFIX__ LL
 // PPC-AIX:#define __INTMAX_FMTd__ "lld"
 // PPC-AIX:#define __INTMAX_FMTi__ "lli"
@@ -546,18 +571,23 @@
 // PPC-AIX:#define __THW_BIG_ENDIAN__ 1
 // PPC-AIX:#define __THW_PPC__ 1
 // PPC-AIX:#define __TOS_AIX__ 1
+// PPC-AIX:#define __UINT16_C(c) c
 // PPC-AIX:#define __UINT16_C_SUFFIX__
 // PPC-AIX:#define __UINT16_MAX__ 65535
 // PPC-AIX:#define __UINT16_TYPE__ unsigned short
+// PPC-AIX:#define __UINT32_C(c) c##U
 // PPC-AIX:#define __UINT32_C_SUFFIX__ U
 // PPC-AIX:#define __UINT32_MAX__ 4294967295U
 // PPC-AIX:#define __UINT32_TYPE__ unsigned int
+// PPC-AIX:#define __UINT64_C(c) c##ULL
 // PPC-AIX:#define __UINT64_C_SUFFIX__ ULL
 // PPC-AIX:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC-AIX:#define __UINT64_TYPE__ long long unsigned int
+// PPC-AIX:#define __UINT8_C(c) c
 // PPC-AIX:#define __UINT8_C_SUFFIX__
 // PPC-AIX:#define __UINT8_MAX__ 255
 // PPC-AIX:#define __UINT8_TYPE__ unsigned char
+// PPC-AIX:#define __UINTMAX_C(c) c##ULL
 // PPC-AIX:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC-AIX:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC-AIX:#define __UINTMAX_TYPE__ long long unsigned int
@@ -807,26 +837,31 @@
 // PPC-LINUX:#define __FLT_MIN__ 1.17549435e-38F
 // PPC-LINUX:#define __FLT_RADIX__ 2
 // PPC-LINUX:#define __HAVE_BSWAP__ 1
+// PPC-LINUX:#define __INT16_C(c) c
 // PPC-LINUX:#define __INT16_C_SUFFIX__
 // PPC-LINUX:#define __INT16_FMTd__ "hd"
 // PPC-LINUX:#define __INT16_FMTi__ "hi"
 // PPC-LINUX:#define __INT16_MAX__ 32767
 // PPC-LINUX:#define __INT16_TYPE__ short
+// PPC-LINUX:#define __INT32_C(c) c
 // PPC-LINUX:#define __INT32_C_SUFFIX__
 // PPC-LINUX:#define __INT32_FMTd__ "d"
 // PPC-LINUX:#define __INT32_FMTi__ "i"
 // PPC-LINUX:#define __INT32_MAX__ 2147483647
 // PPC-LINUX:#define __INT32_TYPE__ int
+// PPC-LINUX:#define __INT64_C(c) c##LL
 // PPC-LINUX:#define __INT64_C_SUFFIX__ LL
 // PPC-LINUX:#define __INT64_FMTd__ "lld"
 // PPC-LINUX:#define __INT64_FMTi__ "lli"
 // PPC-LINUX:#define __INT64_MAX__ 9223372036854775807LL
 // PPC-LINUX:#define __INT64_TYPE__ long long int
+// PPC-LINUX:#define __INT8_C(c) c
 // PPC-LINUX:#define __INT8_C_SUFFIX__
 // PPC-LINUX:#define __INT8_FMTd__ "hhd"
 // PPC-LINUX:#define __INT8_FMTi__ "hhi"
 // PPC-LINUX:#define __INT8_MAX__ 127
 // PPC-LINUX:#define __INT8_TYPE__ signed char
+// PPC-LINUX:#define __INTMAX_C(c) c##LL
 // PPC-LINUX:#define __INTMAX_C_SUFFIX__ LL
 // PPC-LINUX:#define __INTMAX_FMTd__ "lld"
 // PPC-LINUX:#define __INTMAX_FMTi__ "lli"
@@ -915,18 +950,23 @@
 // PPC-LINUX:#define __SIZE_MAX__ 4294967295U
 // PPC-LINUX:#define __SIZE_TYPE__ unsigned int
 // PPC-LINUX:#define __SIZE_WIDTH__ 32
+// PPC-LINUX:#define __UINT16_C(c) c
 // PPC-LINUX:#define __UINT16_C_SUFFIX__
 // PPC-LINUX:#define __UINT16_MAX__ 65535
 // PPC-LINUX:#define __UINT16_TYPE__ unsigned short
+// PPC-LINUX:#define __UINT32_C(c) c##U
 // PPC-LINUX:#define __UINT32_C_SUFFIX__ U
 // PPC-LINUX:#define __UINT32_MAX__ 4294967295U
 // PPC-LINUX:#define __UINT32_TYPE__ unsigned int
+// PPC-LINUX:#define __UINT64_C(c) c##ULL
 // PPC-LINUX:#define __UINT64_C_SUFFIX__ ULL
 // PPC-LINUX:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC-LINUX:#define __UINT64_TYPE__ long long unsigned int
+// PPC-LINUX:#define __UINT8_C(c) c
 // PPC-LINUX:#define __UINT8_C_SUFFIX__
 // PPC-LINUX:#define __UINT8_MAX__ 255
 // PPC-LINUX:#define __UINT8_TYPE__ unsigned char
+// PPC-LINUX:#define __UINTMAX_C(c) c##ULL
 // PPC-LINUX:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC-LINUX:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC-LINUX:#define __UINTMAX_TYPE__ long long unsigned int
diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c
index 57e2ca31d5d53..7dffd4627481b 100644
--- a/clang/test/Preprocessor/init-ppc64.c
+++ b/clang/test/Preprocessor/init-ppc64.c
@@ -47,26 +47,31 @@
 // PPC64:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64:#define __FLT_RADIX__ 2
 // PPC64:#define __HAVE_BSWAP__ 1
+// PPC64:#define __INT16_C(c) c
 // PPC64:#define __INT16_C_SUFFIX__
 // PPC64:#define __INT16_FMTd__ "hd"
 // PPC64:#define __INT16_FMTi__ "hi"
 // PPC64:#define __INT16_MAX__ 32767
 // PPC64:#define __INT16_TYPE__ short
+// PPC64:#define __INT32_C(c) c
 // PPC64:#define __INT32_C_SUFFIX__
 // PPC64:#define __INT32_FMTd__ "d"
 // PPC64:#define __INT32_FMTi__ "i"
 // PPC64:#define __INT32_MAX__ 2147483647
 // PPC64:#define __INT32_TYPE__ int
+// PPC64:#define __INT64_C(c) c##L
 // PPC64:#define __INT64_C_SUFFIX__ L
 // PPC64:#define __INT64_FMTd__ "ld"
 // PPC64:#define __INT64_FMTi__ "li"
 // PPC64:#define __INT64_MAX__ 9223372036854775807L
 // PPC64:#define __INT64_TYPE__ long int
+// PPC64:#define __INT8_C(c) c
 // PPC64:#define __INT8_C_SUFFIX__
 // PPC64:#define __INT8_FMTd__ "hhd"
 // PPC64:#define __INT8_FMTi__ "hhi"
 // PPC64:#define __INT8_MAX__ 127
 // PPC64:#define __INT8_TYPE__ signed char
+// PPC64:#define __INTMAX_C(c) c##L
 // PPC64:#define __INTMAX_C_SUFFIX__ L
 // PPC64:#define __INTMAX_FMTd__ "ld"
 // PPC64:#define __INTMAX_FMTi__ "li"
@@ -157,18 +162,23 @@
 // PPC64:#define __SIZE_TYPE__ long unsigned int
 // PPC64:#define __SIZE_WIDTH__ 64
 // PPC64-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
+// PPC64:#define __UINT16_C(c) c
 // PPC64:#define __UINT16_C_SUFFIX__
 // PPC64:#define __UINT16_MAX__ 65535
 // PPC64:#define __UINT16_TYPE__ unsigned short
+// PPC64:#define __UINT32_C(c) c##U
 // PPC64:#define __UINT32_C_SUFFIX__ U
 // PPC64:#define __UINT32_MAX__ 4294967295U
 // PPC64:#define __UINT32_TYPE__ unsigned int
+// PPC64:#define __UINT64_C(c) c##UL
 // PPC64:#define __UINT64_C_SUFFIX__ UL
 // PPC64:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64:#define __UINT64_TYPE__ long unsigned int
+// PPC64:#define __UINT8_C(c) c
 // PPC64:#define __UINT8_C_SUFFIX__
 // PPC64:#define __UINT8_MAX__ 255
 // PPC64:#define __UINT8_TYPE__ unsigned char
+// PPC64:#define __UINTMAX_C(c) c##UL
 // PPC64:#define __UINTMAX_C_SUFFIX__ UL
 // PPC64:#define __UINTMAX_MAX__ 18446744073709551615UL
 // PPC64:#define __UINTMAX_TYPE__ long unsigned int
@@ -250,26 +260,31 @@
 // PPC64LE:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64LE:#define __FLT_RADIX__ 2
 // PPC64LE:#define __HAVE_BSWAP__ 1
+// PPC64LE:#define __INT16_C(c) c
 // PPC64LE:#define __INT16_C_SUFFIX__
 // PPC64LE:#define __INT16_FMTd__ "hd"
 // PPC64LE:#define __INT16_FMTi__ "hi"
 // PPC64LE:#define __INT16_MAX__ 32767
 // PPC64LE:#define __INT16_TYPE__ short
+// PPC64LE:#define __INT32_C(c) c
 // PPC64LE:#define __INT32_C_SUFFIX__
 // PPC64LE:#define __INT32_FMTd__ "d"
 // PPC64LE:#define __INT32_FMTi__ "i"
 // PPC64LE:#define __INT32_MAX__ 2147483647
 // PPC64LE:#define __INT32_TYPE__ int
+// PPC64LE:#define __INT64_C(c) c##L
 // PPC64LE:#define __INT64_C_SUFFIX__ L
 // PPC64LE:#define __INT64_FMTd__ "ld"
 // PPC64LE:#define __INT64_FMTi__ "li"
 // PPC64LE:#define __INT64_MAX__ 9223372036854775807L
 // PPC64LE:#define __INT64_TYPE__ long int
+// PPC64LE:#define __INT8_C(c) c
 // PPC64LE:#define __INT8_C_SUFFIX__
 // PPC64LE:#define __INT8_FMTd__ "hhd"
 // PPC64LE:#define __INT8_FMTi__ "hhi"
 // PPC64LE:#define __INT8_MAX__ 127
 // PPC64LE:#define __INT8_TYPE__ signed char
+// PPC64LE:#define __INTMAX_C(c) c##L
 // PPC64LE:#define __INTMAX_C_SUFFIX__ L
 // PPC64LE:#define __INTMAX_FMTd__ "ld"
 // PPC64LE:#define __INTMAX_FMTi__ "li"
@@ -361,18 +376,23 @@
 // PPC64LE:#define __SIZE_TYPE__ long unsigned int
 // PPC64LE:#define __SIZE_WIDTH__ 64
 // PPC64LE:#define __STRUCT_PARM_ALIGN__ 16
+// PPC64LE:#define __UINT16_C(c) c
 // PPC64LE:#define __UINT16_C_SUFFIX__
 // PPC64LE:#define __UINT16_MAX__ 65535
 // PPC64LE:#define __UINT16_TYPE__ unsigned short
+// PPC64LE:#define __UINT32_C(c) c##U
 // PPC64LE:#define __UINT32_C_SUFFIX__ U
 // PPC64LE:#define __UINT32_MAX__ 4294967295U
 // PPC64LE:#define __UINT32_TYPE__ unsigned int
+// PPC64LE:#define __UINT64_C(c) c##UL
 // PPC64LE:#define __UINT64_C_SUFFIX__ UL
 // PPC64LE:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64LE:#define __UINT64_TYPE__ long unsigned int
+// PPC64LE:#define __UINT8_C(c) c
 // PPC64LE:#define __UINT8_C_SUFFIX__
 // PPC64LE:#define __UINT8_MAX__ 255
 // PPC64LE:#define __UINT8_TYPE__ unsigned char
+// PPC64LE:#define __UINTMAX_C(c) c##UL
 // PPC64LE:#define __UINTMAX_C_SUFFIX__ UL
 // PPC64LE:#define __UINTMAX_MAX__ 18446744073709551615UL
 // PPC64LE:#define __UINTMAX_TYPE__ long unsigned int
@@ -733,26 +753,31 @@
 // PPC64-AIX:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64-AIX:#define __FLT_RADIX__ 2
 // PPC64-AIX-NOT:#define __ILP32__ 1
+// PPC64-AIX:#define __INT16_C(c) c
 // PPC64-AIX:#define __INT16_C_SUFFIX__
 // PPC64-AIX:#define __INT16_FMTd__ "hd"
 // PPC64-AIX:#define __INT16_FMTi__ "hi"
 // PPC64-AIX:#define __INT16_MAX__ 32767
 // PPC64-AIX:#define __INT16_TYPE__ short
+// PPC64-AIX:#define __INT32_C(c) c
 // PPC64-AIX:#define __INT32_C_SUFFIX__
 // PPC64-AIX:#define __INT32_FMTd__ "d"
 // PPC64-AIX:#define __INT32_FMTi__ "i"
 // PPC64-AIX:#define __INT32_MAX__ 2147483647
 // PPC64-AIX:#define __INT32_TYPE__ int
+// PPC64-AIX:#define __INT64_C(c) c##L
 // PPC64-AIX:#define __INT64_C_SUFFIX__ L
 // PPC64-AIX:#define __INT64_FMTd__ "ld"
 // PPC64-AIX:#define __INT64_FMTi__ "li"
 // PPC64-AIX:#define __INT64_MAX__ 9223372036854775807L
 // PPC64-AIX:#define __INT64_TYPE__ long int
+// PPC64-AIX:#define __INT8_C(c) c
 // PPC64-AIX:#define __INT8_C_SUFFIX__
 // PPC64-AIX:#define __INT8_FMTd__ "hhd"
 // PPC64-AIX:#define __INT8_FMTi__ "hhi"
 // PPC64-AIX:#define __INT8_MAX__ 127
 // PPC64-AIX:#define __INT8_TYPE__ signed char
+// PPC64-AIX:#define __INTMAX_C(c) c##L
 // PPC64-AIX:#define __INTMAX_C_SUFFIX__ L
 // PPC64-AIX:#define __INTMAX_FMTd__ "ld"
 // PPC64-AIX:#define __INTMAX_FMTi__ "li"
@@ -842,18 +867,23 @@
 // PPC64-AIX:#define __SIZE_MAX__ 18446744073709551615UL
 // PPC64-AIX:#define __SIZE_TYPE__ long unsigned int
 // PPC64-AIX:#define __SIZE_WIDTH__ 64
+// PPC64-AIX:#define __UINT16_C(c) c
 // PPC64-AIX:#define __UINT16_C_SUFFIX__
 // PPC64-AIX:#define __UINT16_MAX__ 65535
 // PPC64-AIX:#define __UINT16_TYPE__ unsigned short
+// PPC64-AIX:#define __UINT32_C(c) c##U
 // PPC64-AIX:#define __UINT32_C_SUFFIX__ U
 // PPC64-AIX:#define __UINT32_MAX__ 4294967295U
 // PPC64-AIX:#define __UINT32_TYPE__ unsigned int
+// PPC64-AIX:#define __UINT64_C(c) c##UL
 // PPC64-AIX:#define __UINT64_C_SUFFIX__ UL
 // PPC64-AIX:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64-AIX:#define __UINT64_TYPE__ long unsigned int
+// PPC64-AIX:#define __UINT8_C(c) c
 // PPC64-AIX:#define __UINT8_C_SUFFIX__
 // PPC64-AIX:#define __UINT8_MAX__ 255
 // PPC64-AIX:#define __UINT8_TYPE__ unsigned char
+// PPC64-AIX:#define __UINTMAX_C(c) c##UL
 // PPC64-AIX:#define __UINTMAX_C_SUFFIX__ UL
 // PPC64-AIX:#define __UINTMAX_MAX__ 18446744073709551615UL
 // PPC64-AIX:#define __UINTMAX_TYPE__ long unsigned int
@@ -930,26 +960,31 @@
 // PPC64-LINUX:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64-LINUX:#define __FLT_RADIX__ 2
 // PPC64-LINUX:#define __HAVE_BSWAP__ 1
+// PPC64-LINUX:#define __INT16_C(c) c
 // PPC64-LINUX:#define __INT16_C_SUFFIX__
 // PPC64-LINUX:#define __INT16_FMTd__ "hd"
 // PPC64-LINUX:#define __INT16_FMTi__ "hi"
 // PPC64-LINUX:#define __INT16_MAX__ 32767
 // PPC64-LINUX:#define __INT16_TYPE__ short
+// PPC64-LINUX:#define __INT32_C(c) c
 // PPC64-LINUX:#define __INT32_C_SUFFIX__
 // PPC64-LINUX:#define __INT32_FMTd__ "d"
 // PPC64-LINUX:#define __INT32_FMTi__ "i"
 // PPC64-LINUX:#define __INT32_MAX__ 2147483647
 // PPC64-LINUX:#define __INT32_TYPE__ int
+// PPC64-LINUX:#define __INT64_C(c) c##L
 // PPC64-LINUX:#define __INT64_C_SUFFIX__ L
 // PPC64-LINUX:#define __INT64_FMTd__ "ld"
 // PPC64-LINUX:#define __INT64_FMTi__ "li"
 // PPC64-LINUX:#define __INT64_MAX__ 9223372036854775807L
 // PPC64-LINUX:#define __INT64_TYPE__ long int
+// PPC64-LINUX:#define __INT8_C(c) c
 // PPC64-LINUX:#define __INT8_C_SUFFIX__
 // PPC64-LINUX:#define __INT8_FMTd__ "hhd"
 // PPC64-LINUX:#define __INT8_FMTi__ "hhi"
 // PPC64-LINUX:#define __INT8_MAX__ 127
 // PPC64-LINUX:#define __INT8_TYPE__ signed char
+// PPC64-LINUX:#define __INTMAX_C(c) c##L
 // PPC64-LINUX:#define __INTMAX_C_SUFFIX__ L
 // PPC64-LINUX:#define __INTMAX_FMTd__ "ld"
 // PPC64-LINUX:#define __INTMAX_FMTi__ "li"
@@ -1039,18 +1074,23 @@
 // PPC64-LINUX:#define __SIZE_MAX__ 18446744073709551615UL
 // PPC64-LINUX:#define __SIZE_TYPE__ long unsigned int
 // PPC64-LINUX:#define __SIZE_WIDTH__ 64
+// PPC64-LINUX:#define __UINT16_C(c) c
 // PPC64-LINUX:#define __UINT16_C_SUFFIX__
 // PPC64-LINUX:#define __UINT16_MAX__ 65535
 // PPC64-LINUX:#define __UINT16_TYPE__ unsigned short
+// PPC64-LINUX:#define __UINT32_C(c) c##U
 // PPC64-LINUX:#define __UINT32_C_SUFFIX__ U
 // PPC64-LINUX:#define __UINT32_MAX__ 4294967295U
 // PPC64-LINUX:#define __UINT32_TYPE__ unsigned int
+// PPC64-LINUX:#define __UINT64_C(c) c##UL
 // PPC64-LINUX:#define __UINT64_C_SUFFIX__ UL
 // PPC64-LINUX:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64-LINUX:#define __UINT64_TYPE__ long unsigned int
+// PPC64-LINUX:#define __UINT8_C(c) c
 // PPC64-LINUX:#define __UINT8_C_SUFFIX__
 // PPC64-LINUX:#define __UINT8_MAX__ 255
 // PPC64-LINUX:#define __UINT8_TYPE__ unsigned char
+// PPC64-LINUX:#define __UINTMAX_C(c) c##UL
 // PPC64-LINUX:#define __UINTMAX_C_SUFFIX__ UL
 // PPC64-LINUX:#define __UINTMAX_MAX__ 18446744073709551615UL
 // PPC64-LINUX:#define __UINTMAX_TYPE__ long unsigned int
diff --git a/clang/test/Preprocessor/init-s390x.c b/clang/test/Preprocessor/init-s390x.c
index 6d08e9bfcb632..a8fbde46cbb75 100644
--- a/clang/test/Preprocessor/init-s390x.c
+++ b/clang/test/Preprocessor/init-s390x.c
@@ -34,26 +34,31 @@
 // S390X:#define __FLT_MIN_EXP__ (-125)
 // S390X:#define __FLT_MIN__ 1.17549435e-38F
 // S390X:#define __FLT_RADIX__ 2
+// S390X:#define __INT16_C(c) c
 // S390X:#define __INT16_C_SUFFIX__
 // S390X:#define __INT16_FMTd__ "hd"
 // S390X:#define __INT16_FMTi__ "hi"
 // S390X:#define __INT16_MAX__ 32767
 // S390X:#define __INT16_TYPE__ short
+// S390X:#define __INT32_C(c) c
 // S390X:#define __INT32_C_SUFFIX__
 // S390X:#define __INT32_FMTd__ "d"
 // S390X:#define __INT32_FMTi__ "i"
 // S390X:#define __INT32_MAX__ 2147483647
 // S390X:#define __INT32_TYPE__ int
+// S390X:#define __INT64_C(c) c##L
 // S390X:#define __INT64_C_SUFFIX__ L
 // S390X:#define __INT64_FMTd__ "ld"
 // S390X:#define __INT64_FMTi__ "li"
 // S390X:#define __INT64_MAX__ 9223372036854775807L
 // S390X:#define __INT64_TYPE__ long int
+// S390X:#define __INT8_C(c) c
 // S390X:#define __INT8_C_SUFFIX__
 // S390X:#define __INT8_FMTd__ "hhd"
 // S390X:#define __INT8_FMTi__ "hhi"
 // S390X:#define __INT8_MAX__ 127
 // S390X:#define __INT8_TYPE__ signed char
+// S390X:#define __INTMAX_C(c) c##L
 // S390X:#define __INTMAX_C_SUFFIX__ L
 // S390X:#define __INTMAX_FMTd__ "ld"
 // S390X:#define __INTMAX_FMTi__ "li"
@@ -136,18 +141,23 @@
 // S390X:#define __SIZE_TYPE__ long unsigned int
 // S390X:#define __SIZE_WIDTH__ 64
 // S390X-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8UL
+// S390X:#define __UINT16_C(c) c
 // S390X:#define __UINT16_C_SUFFIX__
 // S390X:#define __UINT16_MAX__ 65535
 // S390X:#define __UINT16_TYPE__ unsigned short
+// S390X:#define __UINT32_C(c) c##U
 // S390X:#define __UINT32_C_SUFFIX__ U
 // S390X:#define __UINT32_MAX__ 4294967295U
 // S390X:#define __UINT32_TYPE__ unsigned int
+// S390X:#define __UINT64_C(c) c##UL
 // S390X:#define __UINT64_C_SUFFIX__ UL
 // S390X:#define __UINT64_MAX__ 18446744073709551615UL
 // S390X:#define __UINT64_TYPE__ long unsigned int
+// S390X:#define __UINT8_C(c) c
 // S390X:#define __UINT8_C_SUFFIX__
 // S390X:#define __UINT8_MAX__ 255
 // S390X:#define __UINT8_TYPE__ unsigned char
+// S390X:#define __UINTMAX_C(c) c##UL
 // S390X:#define __UINTMAX_C_SUFFIX__ UL
 // S390X:#define __UINTMAX_MAX__ 18446744073709551615UL
 // S390X:#define __UINTMAX_TYPE__ long unsigned int
diff --git a/clang/test/Preprocessor/init-v7k-compat.c b/clang/test/Preprocessor/init-v7k-compat.c
index ff5d4bbdea53a..a9c6e7a290646 100644
--- a/clang/test/Preprocessor/init-v7k-compat.c
+++ b/clang/test/Preprocessor/init-v7k-compat.c
@@ -39,26 +39,31 @@
 // CHECK: #define __FLT_MIN_EXP__ (-125)
 // CHECK: #define __FLT_MIN__ 1.17549435e-38F
 // CHECK: #define __FLT_RADIX__ 2
+// CHECK: #define __INT16_C(c) c
 // CHECK: #define __INT16_C_SUFFIX__ {{$}}
 // CHECK: #define __INT16_FMTd__ "hd"
 // CHECK: #define __INT16_FMTi__ "hi"
 // CHECK: #define __INT16_MAX__ 32767
 // CHECK: #define __INT16_TYPE__ short
+// CHECK: #define __INT32_C(c) c
 // CHECK: #define __INT32_C_SUFFIX__ {{$}}
 // CHECK: #define __INT32_FMTd__ "d"
 // CHECK: #define __INT32_FMTi__ "i"
 // CHECK: #define __INT32_MAX__ 2147483647
 // CHECK: #define __INT32_TYPE__ int
+// CHECK: #define __INT64_C(c) c##LL
 // CHECK: #define __INT64_C_SUFFIX__ LL
 // CHECK: #define __INT64_FMTd__ "lld"
 // CHECK: #define __INT64_FMTi__ "lli"
 // CHECK: #define __INT64_MAX__ 9223372036854775807LL
 // CHECK: #define __INT64_TYPE__ long long int
+// CHECK: #define __INT8_C(c) c
 // CHECK: #define __INT8_C_SUFFIX__ {{$}}
 // CHECK: #define __INT8_FMTd__ "hhd"
 // CHECK: #define __INT8_FMTi__ "hhi"
 // CHECK: #define __INT8_MAX__ 127
 // CHECK: #define __INT8_TYPE__ signed char
+// CHECK: #define __INTMAX_C(c) c##LL
 // CHECK: #define __INTMAX_C_SUFFIX__ LL
 // CHECK: #define __INTMAX_FMTd__ "lld"
 // CHECK: #define __INTMAX_FMTi__ "lli"
@@ -140,18 +145,23 @@
 // CHECK: #define __SIZE_MAX__ 4294967295UL
 // CHECK: #define __SIZE_TYPE__ long unsigned int
 // CHECK: #define __SIZE_WIDTH__ 32
+// CHECK: #define __UINT16_C(c) c
 // CHECK: #define __UINT16_C_SUFFIX__ {{$}}
 // CHECK: #define __UINT16_MAX__ 65535
 // CHECK: #define __UINT16_TYPE__ unsigned short
+// CHECK: #define __UINT32_C(c) c##U
 // CHECK: #define __UINT32_C_SUFFIX__ U
 // CHECK: #define __UINT32_MAX__ 4294967295U
 // CHECK: #define __UINT32_TYPE__ unsigned int
+// CHECK: #define __UINT64_C(c) c##ULL
 // CHECK: #define __UINT64_C_SUFFIX__ ULL
 // CHECK: #define __UINT64_MAX__ 18446744073709551615ULL
 // CHECK: #define __UINT64_TYPE__ long long unsigned int
+// CHECK: #define __UINT8_C(c) c
 // CHECK: #define __UINT8_C_SUFFIX__ {{$}}
 // CHECK: #define __UINT8_MAX__ 255
 // CHECK: #define __UINT8_TYPE__ unsigned char
+// CHECK: #define __UINTMAX_C(c) c##ULL
 // CHECK: #define __UINTMAX_C_SUFFIX__ ULL
 // CHECK: #define __UINTMAX_MAX__ 18446744073709551615ULL
 // CHECK: #define __UINTMAX_TYPE__ long long unsigned int
diff --git a/clang/test/Preprocessor/init-ve.c b/clang/test/Preprocessor/init-ve.c
index 13bdb12387db4..711c2a04865b3 100644
--- a/clang/test/Preprocessor/init-ve.c
+++ b/clang/test/Preprocessor/init-ve.c
@@ -45,26 +45,31 @@
 // VE:#define __FLT_MIN_EXP__ (-125)
 // VE:#define __FLT_MIN__ 1.17549435e-38F
 // VE:#define __FLT_RADIX__ 2
+// VE:#define __INT16_C(c) c
 // VE:#define __INT16_C_SUFFIX__
 // VE:#define __INT16_FMTd__ "hd"
 // VE:#define __INT16_FMTi__ "hi"
 // VE:#define __INT16_MAX__ 32767
 // VE:#define __INT16_TYPE__ short
+// VE:#define __INT32_C(c) c
 // VE:#define __INT32_C_SUFFIX__
 // VE:#define __INT32_FMTd__ "d"
 // VE:#define __INT32_FMTi__ "i"
 // VE:#define __INT32_MAX__ 2147483647
 // VE:#define __INT32_TYPE__ int
+// VE:#define __INT64_C(c) c##L
 // VE:#define __INT64_C_SUFFIX__ L
 // VE:#define __INT64_FMTd__ "ld"
 // VE:#define __INT64_FMTi__ "li"
 // VE:#define __INT64_MAX__ 9223372036854775807L
 // VE:#define __INT64_TYPE__ long int
+// VE:#define __INT8_C(c) c
 // VE:#define __INT8_C_SUFFIX__
 // VE:#define __INT8_FMTd__ "hhd"
 // VE:#define __INT8_FMTi__ "hhi"
 // VE:#define __INT8_MAX__ 127
 // VE:#define __INT8_TYPE__ signed char
+// VE:#define __INTMAX_C(c) c##L
 // VE:#define __INTMAX_C_SUFFIX__ L
 // VE:#define __INTMAX_FMTd__ "ld"
 // VE:#define __INTMAX_FMTi__ "li"
@@ -164,6 +169,7 @@
 // VE-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
 // VE-HOSTED:#define __STDC_HOSTED__ 1
 // VE-FREESTANDING:#define __STDC_HOSTED__ 0
+// VE:#define __UINT16_C(c) c
 // VE:#define __UINT16_C_SUFFIX__
 // VE:#define __UINT16_FMTX__ "hX"
 // VE:#define __UINT16_FMTo__ "ho"
@@ -171,6 +177,7 @@
 // VE:#define __UINT16_FMTx__ "hx"
 // VE:#define __UINT16_MAX__ 65535
 // VE:#define __UINT16_TYPE__ unsigned short
+// VE:#define __UINT32_C(c) c##U
 // VE:#define __UINT32_C_SUFFIX__ U
 // VE:#define __UINT32_FMTX__ "X"
 // VE:#define __UINT32_FMTo__ "o"
@@ -178,6 +185,7 @@
 // VE:#define __UINT32_FMTx__ "x"
 // VE:#define __UINT32_MAX__ 4294967295U
 // VE:#define __UINT32_TYPE__ unsigned int
+// VE:#define __UINT64_C(c) c##UL
 // VE:#define __UINT64_C_SUFFIX__ UL
 // VE:#define __UINT64_FMTX__ "lX"
 // VE:#define __UINT64_FMTo__ "lo"
@@ -185,6 +193,7 @@
 // VE:#define __UINT64_FMTx__ "lx"
 // VE:#define __UINT64_MAX__ 18446744073709551615UL
 // VE:#define __UINT64_TYPE__ long unsigned int
+// VE:#define __UINT8_C(c) c
 // VE:#define __UINT8_C_SUFFIX__
 // VE:#define __UINT8_FMTX__ "hhX"
 // VE:#define __UINT8_FMTo__ "hho"
@@ -192,6 +201,7 @@
 // VE:#define __UINT8_FMTx__ "hhx"
 // VE:#define __UINT8_MAX__ 255
 // VE:#define __UINT8_TYPE__ unsigned char
+// VE:#define __UINTMAX_C(c) c##UL
 // VE:#define __UINTMAX_C_SUFFIX__ UL
 // VE:#define __UINTMAX_FMTX__ "lX"
 // VE:#define __UINTMAX_FMTo__ "lo"
diff --git a/clang/test/Preprocessor/init-x86.c b/clang/test/Preprocessor/init-x86.c
index 6f5aa5674e48e..cb77b5583407c 100644
--- a/clang/test/Preprocessor/init-x86.c
+++ b/clang/test/Preprocessor/init-x86.c
@@ -35,26 +35,31 @@
 // I386:#define __FLT_MIN_EXP__ (-125)
 // I386:#define __FLT_MIN__ 1.17549435e-38F
 // I386:#define __FLT_RADIX__ 2
+// I386:#define __INT16_C(c) c
 // I386:#define __INT16_C_SUFFIX__
 // I386:#define __INT16_FMTd__ "hd"
 // I386:#define __INT16_FMTi__ "hi"
 // I386:#define __INT16_MAX__ 32767
 // I386:#define __INT16_TYPE__ short
+// I386:#define __INT32_C(c) c
 // I386:#define __INT32_C_SUFFIX__
 // I386:#define __INT32_FMTd__ "d"
 // I386:#define __INT32_FMTi__ "i"
 // I386:#define __INT32_MAX__ 2147483647
 // I386:#define __INT32_TYPE__ int
+// I386:#define __INT64_C(c) c##LL
 // I386:#define __INT64_C_SUFFIX__ LL
 // I386:#define __INT64_FMTd__ "lld"
 // I386:#define __INT64_FMTi__ "lli"
 // I386:#define __INT64_MAX__ 9223372036854775807LL
 // I386:#define __INT64_TYPE__ long long int
+// I386:#define __INT8_C(c) c
 // I386:#define __INT8_C_SUFFIX__
 // I386:#define __INT8_FMTd__ "hhd"
 // I386:#define __INT8_FMTi__ "hhi"
 // I386:#define __INT8_MAX__ 127
 // I386:#define __INT8_TYPE__ signed char
+// I386:#define __INTMAX_C(c) c##LL
 // I386:#define __INTMAX_C_SUFFIX__ LL
 // I386:#define __INTMAX_FMTd__ "lld"
 // I386:#define __INTMAX_FMTi__ "lli"
@@ -140,18 +145,23 @@
 // I386:#define __SIZE_MAX__ 4294967295U
 // I386:#define __SIZE_TYPE__ unsigned int
 // I386:#define __SIZE_WIDTH__ 32
+// I386:#define __UINT16_C(c) c
 // I386:#define __UINT16_C_SUFFIX__
 // I386:#define __UINT16_MAX__ 65535
 // I386:#define __UINT16_TYPE__ unsigned short
+// I386:#define __UINT32_C(c) c##U
 // I386:#define __UINT32_C_SUFFIX__ U
 // I386:#define __UINT32_MAX__ 4294967295U
 // I386:#define __UINT32_TYPE__ unsigned int
+// I386:#define __UINT64_C(c) c##ULL
 // I386:#define __UINT64_C_SUFFIX__ ULL
 // I386:#define __UINT64_MAX__ 18446744073709551615ULL
 // I386:#define __UINT64_TYPE__ long long unsigned int
+// I386:#define __UINT8_C(c) c
 // I386:#define __UINT8_C_SUFFIX__
 // I386:#define __UINT8_MAX__ 255
 // I386:#define __UINT8_TYPE__ unsigned char
+// I386:#define __UINTMAX_C(c) c##ULL
 // I386:#define __UINTMAX_C_SUFFIX__ ULL
 // I386:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // I386:#define __UINTMAX_TYPE__ long long unsigned int
@@ -235,26 +245,31 @@
 // I386-LINUX:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
 // I386-LINUX:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
 // I386-LINUX:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// I386-LINUX:#define __INT16_C(c) c
 // I386-LINUX:#define __INT16_C_SUFFIX__
 // I386-LINUX:#define __INT16_FMTd__ "hd"
 // I386-LINUX:#define __INT16_FMTi__ "hi"
 // I386-LINUX:#define __INT16_MAX__ 32767
 // I386-LINUX:#define __INT16_TYPE__ short
+// I386-LINUX:#define __INT32_C(c) c
 // I386-LINUX:#define __INT32_C_SUFFIX__
 // I386-LINUX:#define __INT32_FMTd__ "d"
 // I386-LINUX:#define __INT32_FMTi__ "i"
 // I386-LINUX:#define __INT32_MAX__ 2147483647
 // I386-LINUX:#define __INT32_TYPE__ int
+// I386-LINUX:#define __INT64_C(c) c##LL
 // I386-LINUX:#define __INT64_C_SUFFIX__ LL
 // I386-LINUX:#define __INT64_FMTd__ "lld"
 // I386-LINUX:#define __INT64_FMTi__ "lli"
 // I386-LINUX:#define __INT64_MAX__ 9223372036854775807LL
 // I386-LINUX:#define __INT64_TYPE__ long long int
+// I386-LINUX:#define __INT8_C(c) c
 // I386-LINUX:#define __INT8_C_SUFFIX__
 // I386-LINUX:#define __INT8_FMTd__ "hhd"
 // I386-LINUX:#define __INT8_FMTi__ "hhi"
 // I386-LINUX:#define __INT8_MAX__ 127
 // I386-LINUX:#define __INT8_TYPE__ signed char
+// I386-LINUX:#define __INTMAX_C(c) c##LL
 // I386-LINUX:#define __INTMAX_C_SUFFIX__ LL
 // I386-LINUX:#define __INTMAX_FMTd__ "lld"
 // I386-LINUX:#define __INTMAX_FMTi__ "lli"
@@ -341,18 +356,23 @@
 // I386-LINUX:#define __SIZE_TYPE__ unsigned int
 // I386-LINUX:#define __SIZE_WIDTH__ 32
 // I386-LINUX-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8U
+// I386-LINUX:#define __UINT16_C(c) c
 // I386-LINUX:#define __UINT16_C_SUFFIX__
 // I386-LINUX:#define __UINT16_MAX__ 65535
 // I386-LINUX:#define __UINT16_TYPE__ unsigned short
+// I386-LINUX:#define __UINT32_C(c) c##U
 // I386-LINUX:#define __UINT32_C_SUFFIX__ U
 // I386-LINUX:#define __UINT32_MAX__ 4294967295U
 // I386-LINUX:#define __UINT32_TYPE__ unsigned int
+// I386-LINUX:#define __UINT64_C(c) c##ULL
 // I386-LINUX:#define __UINT64_C_SUFFIX__ ULL
 // I386-LINUX:#define __UINT64_MAX__ 18446744073709551615ULL
 // I386-LINUX:#define __UINT64_TYPE__ long long unsigned int
+// I386-LINUX:#define __UINT8_C(c) c
 // I386-LINUX:#define __UINT8_C_SUFFIX__
 // I386-LINUX:#define __UINT8_MAX__ 255
 // I386-LINUX:#define __UINT8_TYPE__ unsigned char
+// I386-LINUX:#define __UINTMAX_C(c) c##ULL
 // I386-LINUX:#define __UINTMAX_C_SUFFIX__ ULL
 // I386-LINUX:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // I386-LINUX:#define __UINTMAX_TYPE__ long long unsigned int
@@ -436,26 +456,31 @@
 // I386-NETBSD:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
 // I386-NETBSD:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
 // I386-NETBSD:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// I386-NETBSD:#define __INT16_C(c) c
 // I386-NETBSD:#define __INT16_C_SUFFIX__
 // I386-NETBSD:#define __INT16_FMTd__ "hd"
 // I386-NETBSD:#define __INT16_FMTi__ "hi"
 // I386-NETBSD:#define __INT16_MAX__ 32767
 // I386-NETBSD:#define __INT16_TYPE__ short
+// I386-NETBSD:#define __INT32_C(c) c
 // I386-NETBSD:#define __INT32_C_SUFFIX__
 // I386-NETBSD:#define __INT32_FMTd__ "d"
 // I386-NETBSD:#define __INT32_FMTi__ "i"
 // I386-NETBSD:#define __INT32_MAX__ 2147483647
 // I386-NETBSD:#define __INT32_TYPE__ int
+// I386-NETBSD:#define __INT64_C(c) c##LL
 // I386-NETBSD:#define __INT64_C_SUFFIX__ LL
 // I386-NETBSD:#define __INT64_FMTd__ "lld"
 // I386-NETBSD:#define __INT64_FMTi__ "lli"
 // I386-NETBSD:#define __INT64_MAX__ 9223372036854775807LL
 // I386-NETBSD:#define __INT64_TYPE__ long long int
+// I386-NETBSD:#define __INT8_C(c) c
 // I386-NETBSD:#define __INT8_C_SUFFIX__
 // I386-NETBSD:#define __INT8_FMTd__ "hhd"
 // I386-NETBSD:#define __INT8_FMTi__ "hhi"
 // I386-NETBSD:#define __INT8_MAX__ 127
 // I386-NETBSD:#define __INT8_TYPE__ signed char
+// I386-NETBSD:#define __INTMAX_C(c) c##LL
 // I386-NETBSD:#define __INTMAX_C_SUFFIX__ LL
 // I386-NETBSD:#define __INTMAX_FMTd__ "lld"
 // I386-NETBSD:#define __INTMAX_FMTi__ "lli"
@@ -542,18 +567,23 @@
 // I386-NETBSD:#define __SIZE_TYPE__ unsigned int
 // I386-NETBSD:#define __SIZE_WIDTH__ 32
 // I386-NETBSD-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 4U
+// I386-NETBSD:#define __UINT16_C(c) c
 // I386-NETBSD:#define __UINT16_C_SUFFIX__
 // I386-NETBSD:#define __UINT16_MAX__ 65535
 // I386-NETBSD:#define __UINT16_TYPE__ unsigned short
+// I386-NETBSD:#define __UINT32_C(c) c##U
 // I386-NETBSD:#define __UINT32_C_SUFFIX__ U
 // I386-NETBSD:#define __UINT32_MAX__ 4294967295U
 // I386-NETBSD:#define __UINT32_TYPE__ unsigned int
+// I386-NETBSD:#define __UINT64_C(c) c##ULL
 // I386-NETBSD:#define __UINT64_C_SUFFIX__ ULL
 // I386-NETBSD:#define __UINT64_MAX__ 18446744073709551615ULL
 // I386-NETBSD:#define __UINT64_TYPE__ long long unsigned int
+// I386-NETBSD:#define __UINT8_C(c) c
 // I386-NETBSD:#define __UINT8_C_SUFFIX__
 // I386-NETBSD:#define __UINT8_MAX__ 255
 // I386-NETBSD:#define __UINT8_TYPE__ unsigned char
+// I386-NETBSD:#define __UINTMAX_C(c) c##ULL
 // I386-NETBSD:#define __UINTMAX_C_SUFFIX__ ULL
 // I386-NETBSD:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // I386-NETBSD:#define __UINTMAX_TYPE__ long long unsigned int
@@ -636,26 +666,31 @@
 // X86_64:#define __FLT_MIN_EXP__ (-125)
 // X86_64:#define __FLT_MIN__ 1.17549435e-38F
 // X86_64:#define __FLT_RADIX__ 2
+// X86_64:#define __INT16_C(c) c
 // X86_64:#define __INT16_C_SUFFIX__
 // X86_64:#define __INT16_FMTd__ "hd"
 // X86_64:#define __INT16_FMTi__ "hi"
 // X86_64:#define __INT16_MAX__ 32767
 // X86_64:#define __INT16_TYPE__ short
+// X86_64:#define __INT32_C(c) c
 // X86_64:#define __INT32_C_SUFFIX__
 // X86_64:#define __INT32_FMTd__ "d"
 // X86_64:#define __INT32_FMTi__ "i"
 // X86_64:#define __INT32_MAX__ 2147483647
 // X86_64:#define __INT32_TYPE__ int
+// X86_64:#define __INT64_C(c) c##L
 // X86_64:#define __INT64_C_SUFFIX__ L
 // X86_64:#define __INT64_FMTd__ "ld"
 // X86_64:#define __INT64_FMTi__ "li"
 // X86_64:#define __INT64_MAX__ 9223372036854775807L
 // X86_64:#define __INT64_TYPE__ long int
+// X86_64:#define __INT8_C(c) c
 // X86_64:#define __INT8_C_SUFFIX__
 // X86_64:#define __INT8_FMTd__ "hhd"
 // X86_64:#define __INT8_FMTi__ "hhi"
 // X86_64:#define __INT8_MAX__ 127
 // X86_64:#define __INT8_TYPE__ signed char
+// X86_64:#define __INTMAX_C(c) c##L
 // X86_64:#define __INTMAX_C_SUFFIX__ L
 // X86_64:#define __INTMAX_FMTd__ "ld"
 // X86_64:#define __INTMAX_FMTi__ "li"
@@ -748,18 +783,23 @@
 // X86_64:#define __SSE_MATH__ 1
 // X86_64:#define __SSE__ 1
 // X86_64-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
+// X86_64:#define __UINT16_C(c) c
 // X86_64:#define __UINT16_C_SUFFIX__
 // X86_64:#define __UINT16_MAX__ 65535
 // X86_64:#define __UINT16_TYPE__ unsigned short
+// X86_64:#define __UINT32_C(c) c##U
 // X86_64:#define __UINT32_C_SUFFIX__ U
 // X86_64:#define __UINT32_MAX__ 4294967295U
 // X86_64:#define __UINT32_TYPE__ unsigned int
+// X86_64:#define __UINT64_C(c) c##UL
 // X86_64:#define __UINT64_C_SUFFIX__ UL
 // X86_64:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64:#define __UINT64_TYPE__ long unsigned int
+// X86_64:#define __UINT8_C(c) c
 // X86_64:#define __UINT8_C_SUFFIX__
 // X86_64:#define __UINT8_MAX__ 255
 // X86_64:#define __UINT8_TYPE__ unsigned char
+// X86_64:#define __UINTMAX_C(c) c##UL
 // X86_64:#define __UINTMAX_C_SUFFIX__ UL
 // X86_64:#define __UINTMAX_MAX__ 18446744073709551615UL
 // X86_64:#define __UINTMAX_TYPE__ long unsigned int
@@ -842,26 +882,31 @@
 // X32:#define __FLT_RADIX__ 2
 // X32:#define __ILP32__ 1
 // X32-NOT:#define __LP64__ 1
+// X32:#define __INT16_C(c) c
 // X32:#define __INT16_C_SUFFIX__
 // X32:#define __INT16_FMTd__ "hd"
 // X32:#define __INT16_FMTi__ "hi"
 // X32:#define __INT16_MAX__ 32767
 // X32:#define __INT16_TYPE__ short
+// X32:#define __INT32_C(c) c
 // X32:#define __INT32_C_SUFFIX__
 // X32:#define __INT32_FMTd__ "d"
 // X32:#define __INT32_FMTi__ "i"
 // X32:#define __INT32_MAX__ 2147483647
 // X32:#define __INT32_TYPE__ int
+// X32:#define __INT64_C(c) c##LL
 // X32:#define __INT64_C_SUFFIX__ LL
 // X32:#define __INT64_FMTd__ "lld"
 // X32:#define __INT64_FMTi__ "lli"
 // X32:#define __INT64_MAX__ 9223372036854775807LL
 // X32:#define __INT64_TYPE__ long long int
+// X32:#define __INT8_C(c) c
 // X32:#define __INT8_C_SUFFIX__
 // X32:#define __INT8_FMTd__ "hhd"
 // X32:#define __INT8_FMTi__ "hhi"
 // X32:#define __INT8_MAX__ 127
 // X32:#define __INT8_TYPE__ signed char
+// X32:#define __INTMAX_C(c) c##LL
 // X32:#define __INTMAX_C_SUFFIX__ LL
 // X32:#define __INTMAX_FMTd__ "lld"
 // X32:#define __INTMAX_FMTi__ "lli"
@@ -952,18 +997,23 @@
 // X32:#define __SSE_MATH__ 1
 // X32:#define __SSE__ 1
 // X32-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16U
+// X32:#define __UINT16_C(c) c
 // X32:#define __UINT16_C_SUFFIX__
 // X32:#define __UINT16_MAX__ 65535
 // X32:#define __UINT16_TYPE__ unsigned short
+// X32:#define __UINT32_C(c) c##U
 // X32:#define __UINT32_C_SUFFIX__ U
 // X32:#define __UINT32_MAX__ 4294967295U
 // X32:#define __UINT32_TYPE__ unsigned int
+// X32:#define __UINT64_C(c) c##ULL
 // X32:#define __UINT64_C_SUFFIX__ ULL
 // X32:#define __UINT64_MAX__ 18446744073709551615ULL
 // X32:#define __UINT64_TYPE__ long long unsigned int
+// X32:#define __UINT8_C(c) c
 // X32:#define __UINT8_C_SUFFIX__
 // X32:#define __UINT8_MAX__ 255
 // X32:#define __UINT8_TYPE__ unsigned char
+// X32:#define __UINTMAX_C(c) c##ULL
 // X32:#define __UINTMAX_C_SUFFIX__ ULL
 // X32:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // X32:#define __UINTMAX_TYPE__ long long unsigned int
@@ -1046,26 +1096,31 @@
 // X86_64-LINUX:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
 // X86_64-LINUX:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
 // X86_64-LINUX:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// X86_64-LINUX:#define __INT16_C(c) c
 // X86_64-LINUX:#define __INT16_C_SUFFIX__
 // X86_64-LINUX:#define __INT16_FMTd__ "hd"
 // X86_64-LINUX:#define __INT16_FMTi__ "hi"
 // X86_64-LINUX:#define __INT16_MAX__ 32767
 // X86_64-LINUX:#define __INT16_TYPE__ short
+// X86_64-LINUX:#define __INT32_C(c) c
 // X86_64-LINUX:#define __INT32_C_SUFFIX__
 // X86_64-LINUX:#define __INT32_FMTd__ "d"
 // X86_64-LINUX:#define __INT32_FMTi__ "i"
 // X86_64-LINUX:#define __INT32_MAX__ 2147483647
 // X86_64-LINUX:#define __INT32_TYPE__ int
+// X86_64-LINUX:#define __INT64_C(c) c##L
 // X86_64-LINUX:#define __INT64_C_SUFFIX__ L
 // X86_64-LINUX:#define __INT64_FMTd__ "ld"
 // X86_64-LINUX:#define __INT64_FMTi__ "li"
 // X86_64-LINUX:#define __INT64_MAX__ 9223372036854775807L
 // X86_64-LINUX:#define __INT64_TYPE__ long int
+// X86_64-LINUX:#define __INT8_C(c) c
 // X86_64-LINUX:#define __INT8_C_SUFFIX__
 // X86_64-LINUX:#define __INT8_FMTd__ "hhd"
 // X86_64-LINUX:#define __INT8_FMTi__ "hhi"
 // X86_64-LINUX:#define __INT8_MAX__ 127
 // X86_64-LINUX:#define __INT8_TYPE__ signed char
+// X86_64-LINUX:#define __INTMAX_C(c) c##L
 // X86_64-LINUX:#define __INTMAX_C_SUFFIX__ L
 // X86_64-LINUX:#define __INTMAX_FMTd__ "ld"
 // X86_64-LINUX:#define __INTMAX_FMTi__ "li"
@@ -1156,18 +1211,23 @@
 // X86_64-LINUX:#define __SSE2__ 1
 // X86_64-LINUX:#define __SSE_MATH__ 1
 // X86_64-LINUX:#define __SSE__ 1
+// X86_64-LINUX:#define __UINT16_C(c) c
 // X86_64-LINUX:#define __UINT16_C_SUFFIX__
 // X86_64-LINUX:#define __UINT16_MAX__ 65535
 // X86_64-LINUX:#define __UINT16_TYPE__ unsigned short
+// X86_64-LINUX:#define __UINT32_C(c) c##U
 // X86_64-LINUX:#define __UINT32_C_SUFFIX__ U
 // X86_64-LINUX:#define __UINT32_MAX__ 4294967295U
 // X86_64-LINUX:#define __UINT32_TYPE__ unsigned int
+// X86_64-LINUX:#define __UINT64_C(c) c##UL
 // X86_64-LINUX:#define __UINT64_C_SUFFIX__ UL
 // X86_64-LINUX:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64-LINUX:#define __UINT64_TYPE__ long unsigned int
+// X86_64-LINUX:#define __UINT8_C(c) c
 // X86_64-LINUX:#define __UINT8_C_SUFFIX__
 // X86_64-LINUX:#define __UINT8_MAX__ 255
 // X86_64-LINUX:#define __UINT8_TYPE__ unsigned char
+// X86_64-LINUX:#define __UINTMAX_C(c) c##UL
 // X86_64-LINUX:#define __UINTMAX_C_SUFFIX__ UL
 // X86_64-LINUX:#define __UINTMAX_MAX__ 18446744073709551615UL
 // X86_64-LINUX:#define __UINTMAX_TYPE__ long unsigned int
@@ -1258,26 +1318,31 @@
 // X86_64-NETBSD:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
 // X86_64-NETBSD:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
 // X86_64-NETBSD:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// X86_64-NETBSD:#define __INT16_C(c) c
 // X86_64-NETBSD:#define __INT16_C_SUFFIX__
 // X86_64-NETBSD:#define __INT16_FMTd__ "hd"
 // X86_64-NETBSD:#define __INT16_FMTi__ "hi"
 // X86_64-NETBSD:#define __INT16_MAX__ 32767
 // X86_64-NETBSD:#define __INT16_TYPE__ short
+// X86_64-NETBSD:#define __INT32_C(c) c
 // X86_64-NETBSD:#define __INT32_C_SUFFIX__
 // X86_64-NETBSD:#define __INT32_FMTd__ "d"
 // X86_64-NETBSD:#define __INT32_FMTi__ "i"
 // X86_64-NETBSD:#define __INT32_MAX__ 2147483647
 // X86_64-NETBSD:#define __INT32_TYPE__ int
+// X86_64-NETBSD:#define __INT64_C(c) c##L
 // X86_64-NETBSD:#define __INT64_C_SUFFIX__ L
 // X86_64-NETBSD:#define __INT64_FMTd__ "ld"
 // X86_64-NETBSD:#define __INT64_FMTi__ "li"
 // X86_64-NETBSD:#define __INT64_MAX__ 9223372036854775807L
 // X86_64-NETBSD:#define __INT64_TYPE__ long int
+// X86_64-NETBSD:#define __INT8_C(c) c
 // X86_64-NETBSD:#define __INT8_C_SUFFIX__
 // X86_64-NETBSD:#define __INT8_FMTd__ "hhd"
 // X86_64-NETBSD:#define __INT8_FMTi__ "hhi"
 // X86_64-NETBSD:#define __INT8_MAX__ 127
 // X86_64-NETBSD:#define __INT8_TYPE__ signed char
+// X86_64-NETBSD:#define __INTMAX_C(c) c##L
 // X86_64-NETBSD:#define __INTMAX_C_SUFFIX__ L
 // X86_64-NETBSD:#define __INTMAX_FMTd__ "ld"
 // X86_64-NETBSD:#define __INTMAX_FMTi__ "li"
@@ -1368,18 +1433,23 @@
 // X86_64-NETBSD:#define __SSE2__ 1
 // X86_64-NETBSD:#define __SSE_MATH__ 1
 // X86_64-NETBSD:#define __SSE__ 1
+// X86_64-NETBSD:#define __UINT16_C(c) c
 // X86_64-NETBSD:#define __UINT16_C_SUFFIX__
 // X86_64-NETBSD:#define __UINT16_MAX__ 65535
 // X86_64-NETBSD:#define __UINT16_TYPE__ unsigned short
+// X86_64-NETBSD:#define __UINT32_C(c) c##U
 // X86_64-NETBSD:#define __UINT32_C_SUFFIX__ U
 // X86_64-NETBSD:#define __UINT32_MAX__ 4294967295U
 // X86_64-NETBSD:#define __UINT32_TYPE__ unsigned int
+// X86_64-NETBSD:#define __UINT64_C(c) c##UL
 // X86_64-NETBSD:#define __UINT64_C_SUFFIX__ UL
 // X86_64-NETBSD:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64-NETBSD:#define __UINT64_TYPE__ long unsigned int
+// X86_64-NETBSD:#define __UINT8_C(c) c
 // X86_64-NETBSD:#define __UINT8_C_SUFFIX__
 // X86_64-NETBSD:#define __UINT8_MAX__ 255
 // X86_64-NETBSD:#define __UINT8_TYPE__ unsigned char
+// X86_64-NETBSD:#define __UINTMAX_C(c) c##UL
 // X86_64-NETBSD:#define __UINTMAX_C_SUFFIX__ UL
 // X86_64-NETBSD:#define __UINTMAX_MAX__ 18446744073709551615UL
 // X86_64-NETBSD:#define __UINTMAX_TYPE__ long unsigned int
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index 5999b9c1d1bc3..1ac325d444662 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -426,26 +426,31 @@
 // MSP430:#define __FLT_MIN_EXP__ (-125)
 // MSP430:#define __FLT_MIN__ 1.17549435e-38F
 // MSP430:#define __FLT_RADIX__ 2
+// MSP430:#define __INT16_C(c) c
 // MSP430:#define __INT16_C_SUFFIX__
 // MSP430:#define __INT16_FMTd__ "hd"
 // MSP430:#define __INT16_FMTi__ "hi"
 // MSP430:#define __INT16_MAX__ 32767
 // MSP430:#define __INT16_TYPE__ short
+// MSP430:#define __INT32_C(c) c##L
 // MSP430:#define __INT32_C_SUFFIX__ L
 // MSP430:#define __INT32_FMTd__ "ld"
 // MSP430:#define __INT32_FMTi__ "li"
 // MSP430:#define __INT32_MAX__ 2147483647L
 // MSP430:#define __INT32_TYPE__ long int
+// MSP430:#define __INT64_C(c) c##LL
 // MSP430:#define __INT64_C_SUFFIX__ LL
 // MSP430:#define __INT64_FMTd__ "lld"
 // MSP430:#define __INT64_FMTi__ "lli"
 // MSP430:#define __INT64_MAX__ 9223372036854775807LL
 // MSP430:#define __INT64_TYPE__ long long int
+// MSP430:#define __INT8_C(c) c
 // MSP430:#define __INT8_C_SUFFIX__
 // MSP430:#define __INT8_FMTd__ "hhd"
 // MSP430:#define __INT8_FMTi__ "hhi"
 // MSP430:#define __INT8_MAX__ 127
 // MSP430:#define __INT8_TYPE__ signed char
+// MSP430:#define __INTMAX_C(c) c##LL
 // MSP430:#define __INTMAX_C_SUFFIX__ LL
 // MSP430:#define __INTMAX_FMTd__ "lld"
 // MSP430:#define __INTMAX_FMTi__ "lli"
@@ -531,18 +536,23 @@
 // MSP430:#define __SIZE_TYPE__ unsigned int
 // MSP430:#define __SIZE_WIDTH__ 16
 // MSP430-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 2U
+// MSP430:#define __UINT16_C(c) c##U
 // MSP430:#define __UINT16_C_SUFFIX__ U
 // MSP430:#define __UINT16_MAX__ 65535U
 // MSP430:#define __UINT16_TYPE__ unsigned short
+// MSP430:#define __UINT32_C(c) c##UL
 // MSP430:#define __UINT32_C_SUFFIX__ UL
 // MSP430:#define __UINT32_MAX__ 4294967295UL
 // MSP430:#define __UINT32_TYPE__ long unsigned int
+// MSP430:#define __UINT64_C(c) c##ULL
 // MSP430:#define __UINT64_C_SUFFIX__ ULL
 // MSP430:#define __UINT64_MAX__ 18446744073709551615ULL
 // MSP430:#define __UINT64_TYPE__ long long unsigned int
+// MSP430:#define __UINT8_C(c) c
 // MSP430:#define __UINT8_C_SUFFIX__
 // MSP430:#define __UINT8_MAX__ 255
 // MSP430:#define __UINT8_TYPE__ unsigned char
+// MSP430:#define __UINTMAX_C(c) c##ULL
 // MSP430:#define __UINTMAX_C_SUFFIX__ ULL
 // MSP430:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MSP430:#define __UINTMAX_TYPE__ long long unsigned int
@@ -613,26 +623,31 @@
 // NVPTX32:#define __FLT_MIN_EXP__ (-125)
 // NVPTX32:#define __FLT_MIN__ 1.17549435e-38F
 // NVPTX32:#define __FLT_RADIX__ 2
+// NVPTX32:#define __INT16_C(c) c
 // NVPTX32:#define __INT16_C_SUFFIX__
 // NVPTX32:#define __INT16_FMTd__ "hd"
 // NVPTX32:#define __INT16_FMTi__ "hi"
 // NVPTX32:#define __INT16_MAX__ 32767
 // NVPTX32:#define __INT16_TYPE__ short
+// NVPTX32:#define __INT32_C(c) c
 // NVPTX32:#define __INT32_C_SUFFIX__
 // NVPTX32:#define __INT32_FMTd__ "d"
 // NVPTX32:#define __INT32_FMTi__ "i"
 // NVPTX32:#define __INT32_MAX__ 2147483647
 // NVPTX32:#define __INT32_TYPE__ int
+// NVPTX32:#define __INT64_C(c) c##LL
 // NVPTX32:#define __INT64_C_SUFFIX__ LL
 // NVPTX32:#define __INT64_FMTd__ "lld"
 // NVPTX32:#define __INT64_FMTi__ "lli"
// NVPTX32:#define __INT64_MAX__ 9223372036854775807LL // NVPTX32:#define __INT64_TYPE__ long long int +// NVPTX32:#define __INT8_C(c) c // NVPTX32:#define __INT8_C_SUFFIX__ // NVPTX32:#define __INT8_FMTd__ "hhd" // NVPTX32:#define __INT8_FMTi__ "hhi" // NVPTX32:#define __INT8_MAX__ 127 // NVPTX32:#define __INT8_TYPE__ signed char +// NVPTX32:#define __INTMAX_C(c) c##LL // NVPTX32:#define __INTMAX_C_SUFFIX__ LL // NVPTX32:#define __INTMAX_FMTd__ "lld" // NVPTX32:#define __INTMAX_FMTi__ "lli" @@ -720,18 +735,23 @@ // NVPTX32:#define __SIZE_TYPE__ unsigned int // NVPTX32:#define __SIZE_WIDTH__ 32 // NVPTX32-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8U +// NVPTX32:#define __UINT16_C(c) c // NVPTX32:#define __UINT16_C_SUFFIX__ // NVPTX32:#define __UINT16_MAX__ 65535 // NVPTX32:#define __UINT16_TYPE__ unsigned short +// NVPTX32:#define __UINT32_C(c) c##U // NVPTX32:#define __UINT32_C_SUFFIX__ U // NVPTX32:#define __UINT32_MAX__ 4294967295U // NVPTX32:#define __UINT32_TYPE__ unsigned int +// NVPTX32:#define __UINT64_C(c) c##ULL // NVPTX32:#define __UINT64_C_SUFFIX__ ULL // NVPTX32:#define __UINT64_MAX__ 18446744073709551615ULL // NVPTX32:#define __UINT64_TYPE__ long long unsigned int +// NVPTX32:#define __UINT8_C(c) c // NVPTX32:#define __UINT8_C_SUFFIX__ // NVPTX32:#define __UINT8_MAX__ 255 // NVPTX32:#define __UINT8_TYPE__ unsigned char +// NVPTX32:#define __UINTMAX_C(c) c##ULL // NVPTX32:#define __UINTMAX_C_SUFFIX__ ULL // NVPTX32:#define __UINTMAX_MAX__ 18446744073709551615ULL // NVPTX32:#define __UINTMAX_TYPE__ long long unsigned int @@ -801,26 +821,31 @@ // NVPTX64:#define __FLT_MIN_EXP__ (-125) // NVPTX64:#define __FLT_MIN__ 1.17549435e-38F // NVPTX64:#define __FLT_RADIX__ 2 +// NVPTX64:#define __INT16_C(c) c // NVPTX64:#define __INT16_C_SUFFIX__ // NVPTX64:#define __INT16_FMTd__ "hd" // NVPTX64:#define __INT16_FMTi__ "hi" // NVPTX64:#define __INT16_MAX__ 32767 // NVPTX64:#define __INT16_TYPE__ short +// NVPTX64:#define __INT32_C(c) c // NVPTX64:#define __INT32_C_SUFFIX__ // NVPTX64:#define __INT32_FMTd__ "d" // NVPTX64:#define __INT32_FMTi__ "i" // NVPTX64:#define __INT32_MAX__ 2147483647 // NVPTX64:#define __INT32_TYPE__ int +// NVPTX64:#define __INT64_C(c) c##LL // NVPTX64:#define __INT64_C_SUFFIX__ LL // NVPTX64:#define __INT64_FMTd__ "lld" // NVPTX64:#define __INT64_FMTi__ "lli" // NVPTX64:#define __INT64_MAX__ 9223372036854775807LL // NVPTX64:#define __INT64_TYPE__ long long int +// NVPTX64:#define __INT8_C(c) c // NVPTX64:#define __INT8_C_SUFFIX__ // NVPTX64:#define __INT8_FMTd__ "hhd" // NVPTX64:#define __INT8_FMTi__ "hhi" // NVPTX64:#define __INT8_MAX__ 127 // NVPTX64:#define __INT8_TYPE__ signed char +// NVPTX64:#define __INTMAX_C(c) c##LL // NVPTX64:#define __INTMAX_C_SUFFIX__ LL // NVPTX64:#define __INTMAX_FMTd__ "lld" // NVPTX64:#define __INTMAX_FMTi__ "lli" @@ -908,18 +933,23 @@ // NVPTX64:#define __SIZE_TYPE__ long unsigned int // NVPTX64:#define __SIZE_WIDTH__ 64 // NVPTX64-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8UL +// NVPTX64:#define __UINT16_C(c) c // NVPTX64:#define __UINT16_C_SUFFIX__ // NVPTX64:#define __UINT16_MAX__ 65535 // NVPTX64:#define __UINT16_TYPE__ unsigned short +// NVPTX64:#define __UINT32_C(c) c##U // NVPTX64:#define __UINT32_C_SUFFIX__ U // NVPTX64:#define __UINT32_MAX__ 4294967295U // NVPTX64:#define __UINT32_TYPE__ unsigned int +// NVPTX64:#define __UINT64_C(c) c##ULL // NVPTX64:#define __UINT64_C_SUFFIX__ ULL // NVPTX64:#define __UINT64_MAX__ 18446744073709551615ULL // NVPTX64:#define __UINT64_TYPE__ long long unsigned int +// NVPTX64:#define 
__UINT8_C(c) c // NVPTX64:#define __UINT8_C_SUFFIX__ // NVPTX64:#define __UINT8_MAX__ 255 // NVPTX64:#define __UINT8_TYPE__ unsigned char +// NVPTX64:#define __UINTMAX_C(c) c##ULL // NVPTX64:#define __UINTMAX_C_SUFFIX__ ULL // NVPTX64:#define __UINTMAX_MAX__ 18446744073709551615ULL // NVPTX64:#define __UINTMAX_TYPE__ long long unsigned int @@ -1003,26 +1033,31 @@ // SPARC:#define __FLT_MIN__ 1.17549435e-38F // SPARC:#define __FLT_RADIX__ 2 // SPARC:#define __GCC_ATOMIC_LLONG_LOCK_FREE 1 +// SPARC:#define __INT16_C(c) c // SPARC:#define __INT16_C_SUFFIX__ // SPARC:#define __INT16_FMTd__ "hd" // SPARC:#define __INT16_FMTi__ "hi" // SPARC:#define __INT16_MAX__ 32767 // SPARC:#define __INT16_TYPE__ short +// SPARC:#define __INT32_C(c) c // SPARC:#define __INT32_C_SUFFIX__ // SPARC:#define __INT32_FMTd__ "d" // SPARC:#define __INT32_FMTi__ "i" // SPARC:#define __INT32_MAX__ 2147483647 // SPARC:#define __INT32_TYPE__ int +// SPARC:#define __INT64_C(c) c##LL // SPARC:#define __INT64_C_SUFFIX__ LL // SPARC:#define __INT64_FMTd__ "lld" // SPARC:#define __INT64_FMTi__ "lli" // SPARC:#define __INT64_MAX__ 9223372036854775807LL // SPARC:#define __INT64_TYPE__ long long int +// SPARC:#define __INT8_C(c) c // SPARC:#define __INT8_C_SUFFIX__ // SPARC:#define __INT8_FMTd__ "hhd" // SPARC:#define __INT8_FMTi__ "hhi" // SPARC:#define __INT8_MAX__ 127 // SPARC:#define __INT8_TYPE__ signed char +// SPARC:#define __INTMAX_C(c) c##LL // SPARC:#define __INTMAX_C_SUFFIX__ LL // SPARC:#define __INTMAX_FMTd__ "lld" // SPARC:#define __INTMAX_FMTi__ "lli" @@ -1114,18 +1149,23 @@ // SPARC-NETOPENBSD:#define __SIZE_TYPE__ long unsigned int // SPARC:#define __SIZE_WIDTH__ 32 // SPARC-DEFAULT-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8U +// SPARC:#define __UINT16_C(c) c // SPARC:#define __UINT16_C_SUFFIX__ // SPARC:#define __UINT16_MAX__ 65535 // SPARC:#define __UINT16_TYPE__ unsigned short +// SPARC:#define __UINT32_C(c) c##U // SPARC:#define __UINT32_C_SUFFIX__ U // SPARC:#define __UINT32_MAX__ 4294967295U // SPARC:#define __UINT32_TYPE__ unsigned int +// SPARC:#define __UINT64_C(c) c##ULL // SPARC:#define __UINT64_C_SUFFIX__ ULL // SPARC:#define __UINT64_MAX__ 18446744073709551615ULL // SPARC:#define __UINT64_TYPE__ long long unsigned int +// SPARC:#define __UINT8_C(c) c // SPARC:#define __UINT8_C_SUFFIX__ // SPARC:#define __UINT8_MAX__ 255 // SPARC:#define __UINT8_TYPE__ unsigned char +// SPARC:#define __UINTMAX_C(c) c##ULL // SPARC:#define __UINTMAX_C_SUFFIX__ ULL // SPARC:#define __UINTMAX_MAX__ 18446744073709551615ULL // SPARC:#define __UINTMAX_TYPE__ long long unsigned int @@ -1201,21 +1241,25 @@ // TCE:#define __FLT_MIN_EXP__ (-125) // TCE:#define __FLT_MIN__ 1.17549435e-38F // TCE:#define __FLT_RADIX__ 2 +// TCE:#define __INT16_C(c) c // TCE:#define __INT16_C_SUFFIX__ // TCE:#define __INT16_FMTd__ "hd" // TCE:#define __INT16_FMTi__ "hi" // TCE:#define __INT16_MAX__ 32767 // TCE:#define __INT16_TYPE__ short +// TCE:#define __INT32_C(c) c // TCE:#define __INT32_C_SUFFIX__ // TCE:#define __INT32_FMTd__ "d" // TCE:#define __INT32_FMTi__ "i" // TCE:#define __INT32_MAX__ 2147483647 // TCE:#define __INT32_TYPE__ int +// TCE:#define __INT8_C(c) c // TCE:#define __INT8_C_SUFFIX__ // TCE:#define __INT8_FMTd__ "hhd" // TCE:#define __INT8_FMTi__ "hhi" // TCE:#define __INT8_MAX__ 127 // TCE:#define __INT8_TYPE__ signed char +// TCE:#define __INTMAX_C(c) c##L // TCE:#define __INTMAX_C_SUFFIX__ L // TCE:#define __INTMAX_FMTd__ "ld" // TCE:#define __INTMAX_FMTi__ "li" @@ -1293,15 +1337,19 @@ // TCE-CXX:#define 
__STDCPP_DEFAULT_NEW_ALIGNMENT__ 4U // TCE:#define __TCE_V1__ 1 // TCE:#define __TCE__ 1 +// TCE:#define __UINT16_C(c) c // TCE:#define __UINT16_C_SUFFIX__ // TCE:#define __UINT16_MAX__ 65535 // TCE:#define __UINT16_TYPE__ unsigned short +// TCE:#define __UINT32_C(c) c##U // TCE:#define __UINT32_C_SUFFIX__ U // TCE:#define __UINT32_MAX__ 4294967295U // TCE:#define __UINT32_TYPE__ unsigned int +// TCE:#define __UINT8_C(c) c // TCE:#define __UINT8_C_SUFFIX__ // TCE:#define __UINT8_MAX__ 255 // TCE:#define __UINT8_TYPE__ unsigned char +// TCE:#define __UINTMAX_C(c) c##UL // TCE:#define __UINTMAX_C_SUFFIX__ UL // TCE:#define __UINTMAX_MAX__ 4294967295UL // TCE:#define __UINTMAX_TYPE__ long unsigned int @@ -1373,6 +1421,7 @@ // PS4:#define __FreeBSD_cc_version 900001 // PS4:#define __INT16_TYPE__ short // PS4:#define __INT32_TYPE__ int +// PS4:#define __INT64_C(c) c##L // PS4:#define __INT64_C_SUFFIX__ L // PS4:#define __INT64_TYPE__ long int // PS4:#define __INT8_TYPE__ signed char @@ -1464,6 +1513,7 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix SPARCV9 %s // SPARCV9:#define __BIGGEST_ALIGNMENT__ 16 // SPARCV9:#define __INT64_TYPE__ long int +// SPARCV9:#define __INTMAX_C(c) c##L // SPARCV9:#define __INTMAX_C_SUFFIX__ L // SPARCV9:#define __INTMAX_TYPE__ long int // SPARCV9:#define __INTPTR_TYPE__ long int @@ -1475,8 +1525,10 @@ // // RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-none-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix SPARC64-OBSD %s // SPARC64-OBSD:#define __INT64_TYPE__ long long int +// SPARC64-OBSD:#define __INTMAX_C(c) c##LL // SPARC64-OBSD:#define __INTMAX_C_SUFFIX__ LL // SPARC64-OBSD:#define __INTMAX_TYPE__ long long int +// SPARC64-OBSD:#define __UINTMAX_C(c) c##ULL // SPARC64-OBSD:#define __UINTMAX_C_SUFFIX__ ULL // SPARC64-OBSD:#define __UINTMAX_TYPE__ long long unsigned int // @@ -1720,26 +1772,31 @@ // WEBASSEMBLY-NEXT:#define __GXX_ABI_VERSION 1002 // WEBASSEMBLY32-NEXT:#define __ILP32__ 1 // WEBASSEMBLY64-NOT:#define __ILP32__ +// WEBASSEMBLY-NEXT:#define __INT16_C(c) c // WEBASSEMBLY-NEXT:#define __INT16_C_SUFFIX__ // WEBASSEMBLY-NEXT:#define __INT16_FMTd__ "hd" // WEBASSEMBLY-NEXT:#define __INT16_FMTi__ "hi" // WEBASSEMBLY-NEXT:#define __INT16_MAX__ 32767 // WEBASSEMBLY-NEXT:#define __INT16_TYPE__ short +// WEBASSEMBLY-NEXT:#define __INT32_C(c) c // WEBASSEMBLY-NEXT:#define __INT32_C_SUFFIX__ // WEBASSEMBLY-NEXT:#define __INT32_FMTd__ "d" // WEBASSEMBLY-NEXT:#define __INT32_FMTi__ "i" // WEBASSEMBLY-NEXT:#define __INT32_MAX__ 2147483647 // WEBASSEMBLY-NEXT:#define __INT32_TYPE__ int +// WEBASSEMBLY-NEXT:#define __INT64_C(c) c##LL // WEBASSEMBLY-NEXT:#define __INT64_C_SUFFIX__ LL // WEBASSEMBLY-NEXT:#define __INT64_FMTd__ "lld" // WEBASSEMBLY-NEXT:#define __INT64_FMTi__ "lli" // WEBASSEMBLY-NEXT:#define __INT64_MAX__ 9223372036854775807LL // WEBASSEMBLY-NEXT:#define __INT64_TYPE__ long long int +// WEBASSEMBLY-NEXT:#define __INT8_C(c) c // WEBASSEMBLY-NEXT:#define __INT8_C_SUFFIX__ // WEBASSEMBLY-NEXT:#define __INT8_FMTd__ "hhd" // WEBASSEMBLY-NEXT:#define __INT8_FMTi__ "hhi" // WEBASSEMBLY-NEXT:#define __INT8_MAX__ 127 // WEBASSEMBLY-NEXT:#define __INT8_TYPE__ signed char +// WEBASSEMBLY-NEXT:#define __INTMAX_C(c) c##LL // WEBASSEMBLY-NEXT:#define __INTMAX_C_SUFFIX__ LL // WEBASSEMBLY-NEXT:#define __INTMAX_FMTd__ "lld" // WEBASSEMBLY-NEXT:#define __INTMAX_FMTi__ "lli" @@ -1892,6 +1949,7 @@ // WEBASSEMBLY-NEXT:#define __STDC_UTF_32__ 1 // 
WEBASSEMBLY-NEXT:#define __STDC_VERSION__ 201710L // WEBASSEMBLY-NEXT:#define __STDC__ 1 +// WEBASSEMBLY-NEXT:#define __UINT16_C(c) c // WEBASSEMBLY-NEXT:#define __UINT16_C_SUFFIX__ // WEBASSEMBLY-NEXT:#define __UINT16_FMTX__ "hX" // WEBASSEMBLY-NEXT:#define __UINT16_FMTo__ "ho" @@ -1899,6 +1957,7 @@ // WEBASSEMBLY-NEXT:#define __UINT16_FMTx__ "hx" // WEBASSEMBLY-NEXT:#define __UINT16_MAX__ 65535 // WEBASSEMBLY-NEXT:#define __UINT16_TYPE__ unsigned short +// WEBASSEMBLY-NEXT:#define __UINT32_C(c) c##U // WEBASSEMBLY-NEXT:#define __UINT32_C_SUFFIX__ U // WEBASSEMBLY-NEXT:#define __UINT32_FMTX__ "X" // WEBASSEMBLY-NEXT:#define __UINT32_FMTo__ "o" @@ -1906,6 +1965,7 @@ // WEBASSEMBLY-NEXT:#define __UINT32_FMTx__ "x" // WEBASSEMBLY-NEXT:#define __UINT32_MAX__ 4294967295U // WEBASSEMBLY-NEXT:#define __UINT32_TYPE__ unsigned int +// WEBASSEMBLY-NEXT:#define __UINT64_C(c) c##ULL // WEBASSEMBLY-NEXT:#define __UINT64_C_SUFFIX__ ULL // WEBASSEMBLY-NEXT:#define __UINT64_FMTX__ "llX" // WEBASSEMBLY-NEXT:#define __UINT64_FMTo__ "llo" @@ -1913,6 +1973,7 @@ // WEBASSEMBLY-NEXT:#define __UINT64_FMTx__ "llx" // WEBASSEMBLY-NEXT:#define __UINT64_MAX__ 18446744073709551615ULL // WEBASSEMBLY-NEXT:#define __UINT64_TYPE__ long long unsigned int +// WEBASSEMBLY-NEXT:#define __UINT8_C(c) c // WEBASSEMBLY-NEXT:#define __UINT8_C_SUFFIX__ // WEBASSEMBLY-NEXT:#define __UINT8_FMTX__ "hhX" // WEBASSEMBLY-NEXT:#define __UINT8_FMTo__ "hho" @@ -1920,6 +1981,7 @@ // WEBASSEMBLY-NEXT:#define __UINT8_FMTx__ "hhx" // WEBASSEMBLY-NEXT:#define __UINT8_MAX__ 255 // WEBASSEMBLY-NEXT:#define __UINT8_TYPE__ unsigned char +// WEBASSEMBLY-NEXT:#define __UINTMAX_C(c) c##ULL // WEBASSEMBLY-NEXT:#define __UINTMAX_C_SUFFIX__ ULL // WEBASSEMBLY-NEXT:#define __UINTMAX_FMTX__ "llX" // WEBASSEMBLY-NEXT:#define __UINTMAX_FMTo__ "llo" @@ -2092,18 +2154,23 @@ // AVR:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1 // AVR:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 1 // AVR:#define __GXX_ABI_VERSION 1002 +// AVR:#define __INT16_C(c) c // AVR:#define __INT16_C_SUFFIX__ // AVR:#define __INT16_MAX__ 32767 // AVR:#define __INT16_TYPE__ int +// AVR:#define __INT32_C(c) c##L // AVR:#define __INT32_C_SUFFIX__ L // AVR:#define __INT32_MAX__ 2147483647L // AVR:#define __INT32_TYPE__ long int +// AVR:#define __INT64_C(c) c##LL // AVR:#define __INT64_C_SUFFIX__ LL // AVR:#define __INT64_MAX__ 9223372036854775807LL // AVR:#define __INT64_TYPE__ long long int +// AVR:#define __INT8_C(c) c // AVR:#define __INT8_C_SUFFIX__ // AVR:#define __INT8_MAX__ 127 // AVR:#define __INT8_TYPE__ signed char +// AVR:#define __INTMAX_C(c) c##LL // AVR:#define __INTMAX_C_SUFFIX__ LL // AVR:#define __INTMAX_MAX__ 9223372036854775807LL // AVR:#define __INTMAX_TYPE__ long long int @@ -2175,15 +2242,19 @@ // AVR:#define __STDC__ 1 // AVR:#define __UINT16_MAX__ 65535U // AVR:#define __UINT16_TYPE__ unsigned int +// AVR:#define __UINT32_C(c) c##UL // AVR:#define __UINT32_C_SUFFIX__ UL // AVR:#define __UINT32_MAX__ 4294967295UL // AVR:#define __UINT32_TYPE__ long unsigned int +// AVR:#define __UINT64_C(c) c##ULL // AVR:#define __UINT64_C_SUFFIX__ ULL // AVR:#define __UINT64_MAX__ 18446744073709551615ULL // AVR:#define __UINT64_TYPE__ long long unsigned int +// AVR:#define __UINT8_C(c) c // AVR:#define __UINT8_C_SUFFIX__ // AVR:#define __UINT8_MAX__ 255 // AVR:#define __UINT8_TYPE__ unsigned char +// AVR:#define __UINTMAX_C(c) c##ULL // AVR:#define __UINTMAX_C_SUFFIX__ ULL // AVR:#define __UINTMAX_MAX__ 18446744073709551615ULL // AVR:#define __UINTMAX_TYPE__ long long unsigned int @@ -2383,18 
+2454,23 @@ // RISCV32: #define __GNUC__ {{.*}} // RISCV32: #define __GXX_ABI_VERSION {{.*}} // RISCV32: #define __ILP32__ 1 +// RISCV32: #define __INT16_C(c) c // RISCV32: #define __INT16_C_SUFFIX__ // RISCV32: #define __INT16_MAX__ 32767 // RISCV32: #define __INT16_TYPE__ short +// RISCV32: #define __INT32_C(c) c // RISCV32: #define __INT32_C_SUFFIX__ // RISCV32: #define __INT32_MAX__ 2147483647 // RISCV32: #define __INT32_TYPE__ int +// RISCV32: #define __INT64_C(c) c##LL // RISCV32: #define __INT64_C_SUFFIX__ LL // RISCV32: #define __INT64_MAX__ 9223372036854775807LL // RISCV32: #define __INT64_TYPE__ long long int +// RISCV32: #define __INT8_C(c) c // RISCV32: #define __INT8_C_SUFFIX__ // RISCV32: #define __INT8_MAX__ 127 // RISCV32: #define __INT8_TYPE__ signed char +// RISCV32: #define __INTMAX_C(c) c##LL // RISCV32: #define __INTMAX_C_SUFFIX__ LL // RISCV32: #define __INTMAX_MAX__ 9223372036854775807LL // RISCV32: #define __INTMAX_TYPE__ long long int @@ -2474,18 +2550,23 @@ // RISCV32: #define __STDC_UTF_32__ 1 // RISCV32: #define __STDC_VERSION__ 201710L // RISCV32: #define __STDC__ 1 +// RISCV32: #define __UINT16_C(c) c // RISCV32: #define __UINT16_C_SUFFIX__ // RISCV32: #define __UINT16_MAX__ 65535 // RISCV32: #define __UINT16_TYPE__ unsigned short +// RISCV32: #define __UINT32_C(c) c##U // RISCV32: #define __UINT32_C_SUFFIX__ U // RISCV32: #define __UINT32_MAX__ 4294967295U // RISCV32: #define __UINT32_TYPE__ unsigned int +// RISCV32: #define __UINT64_C(c) c##ULL // RISCV32: #define __UINT64_C_SUFFIX__ ULL // RISCV32: #define __UINT64_MAX__ 18446744073709551615ULL // RISCV32: #define __UINT64_TYPE__ long long unsigned int +// RISCV32: #define __UINT8_C(c) c // RISCV32: #define __UINT8_C_SUFFIX__ // RISCV32: #define __UINT8_MAX__ 255 // RISCV32: #define __UINT8_TYPE__ unsigned char +// RISCV32: #define __UINTMAX_C(c) c##ULL // RISCV32: #define __UINTMAX_C_SUFFIX__ ULL // RISCV32: #define __UINTMAX_MAX__ 18446744073709551615ULL // RISCV32: #define __UINTMAX_TYPE__ long long unsigned int @@ -2596,18 +2677,23 @@ // RISCV64: #define __GNUC_STDC_INLINE__ 1 // RISCV64: #define __GNUC__ {{.*}} // RISCV64: #define __GXX_ABI_VERSION {{.*}} +// RISCV64: #define __INT16_C(c) c // RISCV64: #define __INT16_C_SUFFIX__ // RISCV64: #define __INT16_MAX__ 32767 // RISCV64: #define __INT16_TYPE__ short +// RISCV64: #define __INT32_C(c) c // RISCV64: #define __INT32_C_SUFFIX__ // RISCV64: #define __INT32_MAX__ 2147483647 // RISCV64: #define __INT32_TYPE__ int +// RISCV64: #define __INT64_C(c) c##L // RISCV64: #define __INT64_C_SUFFIX__ L // RISCV64: #define __INT64_MAX__ 9223372036854775807L // RISCV64: #define __INT64_TYPE__ long int +// RISCV64: #define __INT8_C(c) c // RISCV64: #define __INT8_C_SUFFIX__ // RISCV64: #define __INT8_MAX__ 127 // RISCV64: #define __INT8_TYPE__ signed char +// RISCV64: #define __INTMAX_C(c) c##L // RISCV64: #define __INTMAX_C_SUFFIX__ L // RISCV64: #define __INTMAX_MAX__ 9223372036854775807L // RISCV64: #define __INTMAX_TYPE__ long int @@ -2687,18 +2773,23 @@ // RISCV64: #define __STDC_UTF_32__ 1 // RISCV64: #define __STDC_VERSION__ 201710L // RISCV64: #define __STDC__ 1 +// RISCV64: #define __UINT16_C(c) c // RISCV64: #define __UINT16_C_SUFFIX__ // RISCV64: #define __UINT16_MAX__ 65535 // RISCV64: #define __UINT16_TYPE__ unsigned short +// RISCV64: #define __UINT32_C(c) c##U // RISCV64: #define __UINT32_C_SUFFIX__ U // RISCV64: #define __UINT32_MAX__ 4294967295U // RISCV64: #define __UINT32_TYPE__ unsigned int +// RISCV64: #define __UINT64_C(c) c##UL // RISCV64: 
#define __UINT64_C_SUFFIX__ UL // RISCV64: #define __UINT64_MAX__ 18446744073709551615UL // RISCV64: #define __UINT64_TYPE__ long unsigned int +// RISCV64: #define __UINT8_C(c) c // RISCV64: #define __UINT8_C_SUFFIX__ // RISCV64: #define __UINT8_MAX__ 255 // RISCV64: #define __UINT8_TYPE__ unsigned char +// RISCV64: #define __UINTMAX_C(c) c##UL // RISCV64: #define __UINTMAX_C_SUFFIX__ UL // RISCV64: #define __UINTMAX_MAX__ 18446744073709551615UL // RISCV64: #define __UINTMAX_TYPE__ long unsigned int @@ -2837,18 +2928,23 @@ // XTENSA: #define __GNUC__ {{.*}} // XTENSA: #define __GXX_ABI_VERSION {{.*}} // XTENSA: #define __ILP32__ 1 +// XTENSA: #define __INT16_C(c) c // XTENSA: #define __INT16_C_SUFFIX__ // XTENSA: #define __INT16_MAX__ 32767 // XTENSA: #define __INT16_TYPE__ short +// XTENSA: #define __INT32_C(c) c // XTENSA: #define __INT32_C_SUFFIX__ // XTENSA: #define __INT32_MAX__ 2147483647 // XTENSA: #define __INT32_TYPE__ int +// XTENSA: #define __INT64_C(c) c##LL // XTENSA: #define __INT64_C_SUFFIX__ LL // XTENSA: #define __INT64_MAX__ 9223372036854775807LL // XTENSA: #define __INT64_TYPE__ long long int +// XTENSA: #define __INT8_C(c) c // XTENSA: #define __INT8_C_SUFFIX__ // XTENSA: #define __INT8_MAX__ 127 // XTENSA: #define __INT8_TYPE__ signed char +// XTENSA: #define __INTMAX_C(c) c##LL // XTENSA: #define __INTMAX_C_SUFFIX__ LL // XTENSA: #define __INTMAX_MAX__ 9223372036854775807LL // XTENSA: #define __INTMAX_TYPE__ long long int @@ -2945,18 +3041,23 @@ // XTENSA: #define __STDC_UTF_32__ 1 // XTENSA: #define __STDC_VERSION__ 201710L // XTENSA: #define __STDC__ 1 +// XTENSA: #define __UINT16_C(c) c // XTENSA: #define __UINT16_C_SUFFIX__ // XTENSA: #define __UINT16_MAX__ 65535 // XTENSA: #define __UINT16_TYPE__ unsigned short +// XTENSA: #define __UINT32_C(c) c##U // XTENSA: #define __UINT32_C_SUFFIX__ U // XTENSA: #define __UINT32_MAX__ 4294967295U // XTENSA: #define __UINT32_TYPE__ unsigned int +// XTENSA: #define __UINT64_C(c) c##ULL // XTENSA: #define __UINT64_C_SUFFIX__ ULL // XTENSA: #define __UINT64_MAX__ 18446744073709551615ULL // XTENSA: #define __UINT64_TYPE__ long long unsigned int +// XTENSA: #define __UINT8_C(c) c // XTENSA: #define __UINT8_C_SUFFIX__ // XTENSA: #define __UINT8_MAX__ 255 // XTENSA: #define __UINT8_TYPE__ unsigned char +// XTENSA: #define __UINTMAX_C(c) c##ULL // XTENSA: #define __UINTMAX_C_SUFFIX__ ULL // XTENSA: #define __UINTMAX_MAX__ 18446744073709551615ULL // XTENSA: #define __UINTMAX_TYPE__ long long unsigned int From 2af819fa3d802e55027dcc1408186cb8738f08e6 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Sun, 26 Jan 2025 07:53:53 -0800 Subject: [PATCH 141/432] [MemProf] Add test for hot hints (#124394) The change in PR124219 required removing one of the tests added for -memprof-use-hot-hints, since we no longer label any contexts as hot in metadata, so add a new test that checks the hot attribute instead. 
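As a side note (not part of this patch): the hint lands as an ordinary string function attribute on the allocation call site, so downstream C++ code could test for it roughly as sketched below. This is an illustrative sketch only; isMemProfHot is a hypothetical helper, not an existing LLVM API.

#include "llvm/IR/InstrTypes.h"

// Returns true if memprof-use marked this allocation call site as hot,
// i.e. it carries the string attribute "memprof"="hot" checked by the
// MEMPROFHOT lines below.
static bool isMemProfHot(const llvm::CallBase &CB) {
  llvm::Attribute A = CB.getFnAttr("memprof");
  return A.isValid() && A.getValueAsString() == "hot";
}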
--- llvm/test/Transforms/PGOProfile/memprof.ll | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll index 6aa2d307a1dc8..acf70880becd1 100644 --- a/llvm/test/Transforms/PGOProfile/memprof.ll +++ b/llvm/test/Transforms/PGOProfile/memprof.ll @@ -85,6 +85,14 @@ ; RAND2: random hotness seed = 1730170724 ; RUN: opt < %s -passes='memprof-use' -pgo-warn-missing-function -S -stats 2>&1 | FileCheck %s --check-prefixes=MEMPROFRAND2,ALL,MEMPROFONLY,MEMPROFSTATS +;; With the hot access density threshold set to 0, and hot hints enabled, +;; the unconditionally notcold call to new should instead get a hot attribute. +; RUN: opt < %s -passes='memprof-use' -pgo-warn-missing-function -S -memprof-print-match-info -stats -memprof-min-ave-lifetime-access-density-hot-threshold=0 -memprof-use-hot-hints 2>&1 | FileCheck %s --check-prefixes=MEMPROFHOT,ALL + +;; However, with the same threshold, but hot hints not enabled, it should be +;; notcold again. +; RUN: opt < %s -passes='memprof-use' -pgo-warn-missing-function -S -memprof-min-ave-lifetime-access-density-hot-threshold=0 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL + ; MEMPROFMATCHINFO: MemProf notcold context with id 1093248920606587996 has total profiled size 10 is matched ; MEMPROFMATCHINFO: MemProf notcold context with id 5725971306423925017 has total profiled size 10 is matched ; MEMPROFMATCHINFO: MemProf notcold context with id 6792096022461663180 has total profiled size 10 is matched @@ -192,6 +200,7 @@ entry: store ptr %argv, ptr %argv.addr, align 8 ; MEMPROF: call {{.*}} @_Znam{{.*}} #[[A1:[0-9]+]] ; MEMPROFNOCOLINFO: call {{.*}} @_Znam{{.*}} #[[A1:[0-9]+]] + ; MEMPROFHOT: call {{.*}} @_Znam{{.*}} #[[A1:[0-9]+]] %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !dbg !35 store ptr %call, ptr %a, align 8, !dbg !36 ; MEMPROF: call {{.*}} @_Znam{{.*}} #[[A2:[0-9]+]] @@ -404,6 +413,8 @@ for.end: ; preds = %for.cond ; MEMPROFNOCOLINFO: ![[C10]] = !{i64 -4535090212904553409} ; MEMPROFNOCOLINFO: ![[C11]] = !{i64 3577763375057267810} +; MEMPROFHOT: #[[A1]] = { builtin allocsize(0) "memprof"="hot" } + ;; For the specific random seed, this is the expected order of hotness ; MEMPROFRAND2: !"cold" ; MEMPROFRAND2: !"cold" From f8ab91f74f152c8a6d8aaedb8165109c497a618d Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Sun, 26 Jan 2025 16:53:43 +0100 Subject: [PATCH 142/432] [LVI][CVP] Add test for trunc bittest. (NFC) --- .../CorrelatedValuePropagation/icmp.ll | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll b/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll index 72f09a949a060..e4de34c339d2d 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll @@ -1509,3 +1509,107 @@ end: ; %arg is within [-16, -8). 
ret void } + +define void @test_trunc_bittest(i8 %a) { +; CHECK-LABEL: @test_trunc_bittest( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[A:%.*]] to i1 +; CHECK-NEXT: br i1 [[TRUNC]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: if.true: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[A]], 0 +; CHECK-NEXT: call void @check1(i1 [[CMP1]]) +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: call void @check1(i1 [[CMP2]]) +; CHECK-NEXT: ret void +; CHECK: if.false: +; CHECK-NEXT: ret void +; + %trunc = trunc i8 %a to i1 + br i1 %trunc, label %if.true, label %if.false + +if.true: + %cmp1 = icmp ne i8 %a, 0 + call void @check1(i1 %cmp1) + %cmp2 = icmp eq i8 %a, 0 + call void @check1(i1 %cmp2) + ret void + +if.false: + ret void +} + +define void @test_trunc_not_bittest(i8 %a) { +; CHECK-LABEL: @test_trunc_not_bittest( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[A:%.*]] to i1 +; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[TRUNC]], true +; CHECK-NEXT: br i1 [[NOT]], label [[IF_FALSE:%.*]], label [[IF_TRUE:%.*]] +; CHECK: if.true: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[A]], -1 +; CHECK-NEXT: call void @check1(i1 [[CMP1]]) +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[A]], -1 +; CHECK-NEXT: call void @check1(i1 [[CMP2]]) +; CHECK-NEXT: ret void +; CHECK: if.false: +; CHECK-NEXT: ret void +; + %trunc = trunc i8 %a to i1 + %not = xor i1 %trunc, true + br i1 %not, label %if.true, label %if.false + +if.true: + %cmp1 = icmp ne i8 %a, -1 + call void @check1(i1 %cmp1) + %cmp2 = icmp eq i8 %a, -1 + call void @check1(i1 %cmp2) + ret void + +if.false: + ret void +} + +define void @test_icmp_trunc(i8 %a) { +; CHECK-LABEL: @test_icmp_trunc( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[A:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: if.true: +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[A]] to i1 +; CHECK-NEXT: call void @check1(i1 [[TRUNC]]) +; CHECK-NEXT: ret void +; CHECK: if.false: +; CHECK-NEXT: ret void +; + %cmp1 = icmp ne i8 %a, 0 + br i1 %cmp1, label %if.true, label %if.false + +if.true: + %trunc = trunc i8 %a to i1 + call void @check1(i1 %trunc) + ret void + +if.false: + ret void +} + +define void @test_icmp_trunc_not(i8 %a) { +; CHECK-LABEL: @test_icmp_trunc_not( +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[A:%.*]], -1 +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: if.true: +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[A]] to i1 +; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[TRUNC]], true +; CHECK-NEXT: call void @check1(i1 [[TRUNC]]) +; CHECK-NEXT: ret void +; CHECK: if.false: +; CHECK-NEXT: ret void +; + %cmp1 = icmp eq i8 %a, -1 + br i1 %cmp1, label %if.true, label %if.false + +if.true: + %trunc = trunc i8 %a to i1 + %not = xor i1 %trunc, true + call void @check1(i1 %trunc) + ret void + +if.false: + ret void +} From e8e75e08c9214fe25b56535fc26f5435a875a137 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 26 Jan 2025 09:46:38 -0800 Subject: [PATCH 143/432] [lld-macho] Remove unneeded functions from BPSectionOrderer. 
NFC --- lld/MachO/BPSectionOrderer.cpp | 12 +++++------- lld/MachO/BPSectionOrderer.h | 8 +------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp index 18c8aad58344f..c2d03c968534e 100644 --- a/lld/MachO/BPSectionOrderer.cpp +++ b/lld/MachO/BPSectionOrderer.cpp @@ -26,8 +26,7 @@ DenseMap lld::macho::runBalancedPartitioning( auto *isec = subsec.isec; if (!isec || isec->data.empty() || !isec->data.data()) continue; - sections.emplace_back( - std::make_unique<BPSectionMacho>(isec, sections.size())); + sections.emplace_back(std::make_unique<BPSectionMacho>(isec)); } } } @@ -38,11 +37,10 @@ DenseMap lld::macho::runBalancedPartitioning( DenseMap result; for (const auto &[sec, priority] : reorderedSections) { - if (auto *machoSection = dyn_cast<BPSectionMacho>(sec)) { - result.try_emplace( - static_cast<const InputSection *>(machoSection->getSection()), - priority); - } + result.try_emplace( + static_cast<const InputSection *>( + static_cast<const BPSectionMacho *>(sec)->getSection()), + priority); } return result; } diff --git a/lld/MachO/BPSectionOrderer.h b/lld/MachO/BPSectionOrderer.h index 69c6b260f044c..e3e6b12092e20 100644 --- a/lld/MachO/BPSectionOrderer.h +++ b/lld/MachO/BPSectionOrderer.h @@ -57,18 +57,14 @@ class BPSymbolMacho : public BPSymbol { class BPSectionMacho : public BPSectionBase { const InputSection *isec; - uint64_t sectionIdx; public: - explicit BPSectionMacho(const InputSection *sec, uint64_t sectionIdx) - : isec(sec), sectionIdx(sectionIdx) {} + explicit BPSectionMacho(const InputSection *sec) : isec(sec) {} const void *getSection() const override { return isec; } uint64_t getSize() const override { return isec->getSize(); } - uint64_t getSectionIdx() const { return sectionIdx; } bool isCodeSection() const override { return macho::isCodeSection(isec); } SmallVector<std::unique_ptr<BPSymbol>> getSymbols() const override { @@ -118,8 +114,6 @@ class BPSectionMacho : public BPSectionBase { hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end()); } - static bool classof(const BPSectionBase *s) { return true; } - private: static uint64_t getRelocHash(const Reloc &reloc, From ccc066e8d5a742f79b41a0f90ef309d5b9e92c2a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 26 Jan 2025 11:50:10 -0800 Subject: [PATCH 144/432] [TableGen] Avoid repeated map lookups (NFC) (#124448) This patch avoids repeated map lookups and constructions of temporary std::string instances by switching to DenseSet.
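To illustrate the pattern (a minimal sketch, assuming a DenseSet<StringRef> named Seen; the names here are hypothetical, not taken from the patch): DenseSet::insert returns a std::pair whose .second member reports whether the element was newly inserted, so a single call both tests and records membership, avoiding the separate find() plus the temporary std::string key that the std::set version needed.

llvm::DenseSet<llvm::StringRef> Seen;
if (Seen.insert(Name).second) {
  // First occurrence of Name: emit the builtin exactly once.
}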
--- clang/utils/TableGen/MveEmitter.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index 8ebd0bb800fef..58a4d3c22ac36 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -1955,18 +1955,17 @@ void MveEmitter::EmitBuiltinDef(raw_ostream &OS) { << ", \"\", \"n\")\n"; } - std::set<std::string> ShortNamesSeen; + DenseSet<StringRef> ShortNamesSeen; for (const auto &kv : ACLEIntrinsics) { const ACLEIntrinsic &Int = *kv.second; if (Int.polymorphic()) { StringRef Name = Int.shortName(); - if (ShortNamesSeen.find(std::string(Name)) == ShortNamesSeen.end()) { + if (ShortNamesSeen.insert(Name).second) { OS << "BUILTIN(__builtin_arm_mve_" << Name << ", \"vi.\", \"nt"; if (Int.nonEvaluating()) OS << "u"; // indicate that this builtin doesn't evaluate its args OS << "\")\n"; - ShortNamesSeen.insert(std::string(Name)); } } } From 1c4341d176492da5f276937b84a3d0c959e4cf5b Mon Sep 17 00:00:00 2001 From: Vasileios Porpodas Date: Sat, 25 Jan 2025 09:46:42 -0800 Subject: [PATCH 145/432] [SandboxVec][DAG] Fix interval check without Node This patch moves the check of whether a node exists before the check of whether it is contained in the interval. --- .../Vectorize/SandboxVectorizer/DependencyGraph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 390a5e9688cc7..7aa8794d26b20 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -267,11 +267,11 @@ void DependencyGraph::setDefUseUnscheduledSuccs( auto *OpI = dyn_cast<Instruction>(Op); if (OpI == nullptr) continue; - if (!TopInterval.contains(OpI)) - continue; auto *OpN = getNode(OpI); if (OpN == nullptr) continue; + if (!TopInterval.contains(OpI)) + continue; ++OpN->UnscheduledSuccs; } } From fb01a289038c16e13c6133ee602a58254b349411 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Sun, 26 Jan 2025 22:11:40 +0100 Subject: [PATCH 146/432] [LLD][COFF] Implement support for hybrid IAT on ARM64X (#124189) In hybrid images, the PE header references a single IAT for both native and EC views, merging entries where possible. When merging isn't feasible, different imports are grouped together, and ARM64X relocations are emitted as needed. --- lld/COFF/Chunks.cpp | 27 +- lld/COFF/DLL.cpp | 146 ++++++++- lld/COFF/InputFiles.cpp | 14 +- lld/COFF/InputFiles.h | 6 +- lld/test/COFF/arm64x-import.test | 533 +++++++++++++++++++++++++++++++ 5 files changed, 706 insertions(+), 20 deletions(-) create mode 100644 lld/test/COFF/arm64x-import.test diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 11e7cf4346b23..a01c69c709876 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -1172,11 +1172,12 @@ uint64_t Arm64XRelocVal::get() const { size_t Arm64XDynamicRelocEntry::getSize() const { switch (type) { + case IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL: + return sizeof(uint16_t); // Just a header. case IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE: return sizeof(uint16_t) + size; // A header and a payload. case IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA: - case IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL: - llvm_unreachable("unsupported type"); + return 2 * sizeof(uint16_t); // A header and a delta.
} llvm_unreachable("invalid type"); } @@ -1186,6 +1187,9 @@ void Arm64XDynamicRelocEntry::writeTo(uint8_t *buf) const { *out = (offset.get() & 0xfff) | (type << 12); switch (type) { + case IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL: + *out |= ((bit_width(size) - 1) << 14); // Encode the size. + break; case IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE: *out |= ((bit_width(size) - 1) << 14); // Encode the size. switch (size) { @@ -1203,8 +1207,23 @@ void Arm64XDynamicRelocEntry::writeTo(uint8_t *buf) const { } break; case IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA: - case IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL: - llvm_unreachable("unsupported type"); + int delta = value.get(); + // Negative offsets use a sign bit in the header. + if (delta < 0) { + *out |= 1 << 14; + delta = -delta; + } + // Depending on the value, the delta is encoded with a shift of 2 or 3 bits. + if (delta & 7) { + assert(!(delta & 3)); + delta >>= 2; + } else { + *out |= (1 << 15); + delta >>= 3; + } + out[1] = delta; + assert(!(delta & ~0xffff)); + break; } } diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp index ae3a8047b7008..b6fbd5a484b5e 100644 --- a/lld/COFF/DLL.cpp +++ b/lld/COFF/DLL.cpp @@ -716,6 +716,63 @@ class ExportOrdinalChunk : public NonSectionChunk { void IdataContents::create(COFFLinkerContext &ctx) { std::vector<std::vector<DefinedImportData *>> v = binImports(ctx, imports); + // In hybrid images, EC and native code are usually very similar, + // resulting in a highly similar set of imported symbols. Consequently, + // their import tables can be shared, with ARM64X relocations handling any + // differences. Identify matching import files used by EC and native code, and + // merge them into a single hybrid import entry. + if (ctx.hybridSymtab) { + for (std::vector<DefinedImportData *> &syms : v) { + std::vector<DefinedImportData *> hybridSyms; + ImportFile *prev = nullptr; + for (DefinedImportData *sym : syms) { + ImportFile *file = sym->file; + // At this stage, symbols are sorted by base name, ensuring that + // compatible import files, if present, are adjacent. Check if the + // current symbol's file imports the same symbol as the previously added + // one (if any and if it was not already merged). Additionally, verify + // that one of them is native while the other is EC. In rare cases, + // separate matching import entries may exist within the same namespace, + // which cannot be merged. + if (!prev || file->isEC() == prev->isEC() || + !file->isSameImport(prev)) { + // We can't merge the import file, just add it to hybridSyms + // and set prev to its file so that we can try to match the next + // symbol. + hybridSyms.push_back(sym); + prev = file; + continue; + } + + // A matching symbol may appear in syms in any order. The native variant + // exposes a subset of EC symbols and chunks, so always use the EC + // variant as the hybrid import file. If the native file was already + // added, replace it with the EC symbol in hybridSyms. Otherwise, the EC + // variant is already pushed, so we can simply merge it. + if (file->isEC()) { + hybridSyms.pop_back(); + hybridSyms.push_back(sym); + } + + // Merge import files by storing their hybrid form in the corresponding + // file class. + prev->hybridFile = file; + file->hybridFile = prev; + prev = nullptr; // A hybrid import file cannot be merged again. + } + + // Sort symbols by type: native-only files first, followed by merged + // hybrid files, and then EC-only files.
+ llvm::stable_sort(hybridSyms, + [](DefinedImportData *a, DefinedImportData *b) { + if (a->file->hybridFile) + return !b->file->hybridFile && b->file->isEC(); + return !a->file->isEC() && b->file->isEC(); + }); + syms = std::move(hybridSyms); + } + } + // Create .idata contents for each DLL. for (std::vector<DefinedImportData *> &syms : v) { // Create lookup and address tables. If they have external names, @@ -723,19 +780,56 @@ void IdataContents::create(COFFLinkerContext &ctx) { // If they don't (if they are import-by-ordinals), we store only // ordinal values to the table. size_t base = lookups.size(); + Chunk *lookupsTerminator = nullptr, *addressesTerminator = nullptr; for (DefinedImportData *s : syms) { uint16_t ord = s->getOrdinal(); + HintNameChunk *hintChunk = nullptr; + Chunk *lookupsChunk, *addressesChunk; + if (s->getExternalName().empty()) { - lookups.push_back(make<OrdinalOnlyChunk>(ctx, ord)); - addresses.push_back(make<OrdinalOnlyChunk>(ctx, ord)); + lookupsChunk = make<OrdinalOnlyChunk>(ctx, ord); + addressesChunk = make<OrdinalOnlyChunk>(ctx, ord); } else { - auto *c = make<HintNameChunk>(s->getExternalName(), ord); - lookups.push_back(make<LookupChunk>(ctx, c)); - addresses.push_back(make<LookupChunk>(ctx, c)); - hints.push_back(c); + hintChunk = make<HintNameChunk>(s->getExternalName(), ord); + lookupsChunk = make<LookupChunk>(ctx, hintChunk); + addressesChunk = make<LookupChunk>(ctx, hintChunk); + hints.push_back(hintChunk); } - if (s->file->impECSym) { + // Detect the first EC-only import in the hybrid IAT. Emit null chunk + // as a terminator for the native view, and add an ARM64X relocation to + // replace it with the correct import for the EC view. + // + // Additionally, for MSVC compatibility, store the lookup and address + // chunks and append them at the end of EC-only imports, where a null + // terminator chunk would typically be placed. Since they appear after + // the native terminator, they will be ignored in the native view. + // In the EC view, they should act as terminators, so emit ZEROFILL + // relocations overriding them. + if (ctx.hybridSymtab && !lookupsTerminator && s->file->isEC() && + !s->file->hybridFile) { + lookupsTerminator = lookupsChunk; + addressesTerminator = addressesChunk; + lookupsChunk = make<NullChunk>(ctx); + addressesChunk = make<NullChunk>(ctx); + + Arm64XRelocVal relocVal = hintChunk; + if (!hintChunk) + relocVal = (1ULL << 63) | ord; + ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE, + sizeof(uint64_t), lookupsChunk, relocVal); + ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE, + sizeof(uint64_t), addressesChunk, relocVal); + ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL, + sizeof(uint64_t), lookupsTerminator); + ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL, + sizeof(uint64_t), addressesTerminator); + } + + lookups.push_back(lookupsChunk); + addresses.push_back(addressesChunk); + + if (s->file->isEC()) { auto chunk = make<AuxImportChunk>(s->file); auxIat.push_back(chunk); s->file->impECSym->setLocation(chunk); @@ -743,18 +837,27 @@ void IdataContents::create(COFFLinkerContext &ctx) { chunk = make<AuxImportChunk>(s->file); auxIatCopy.push_back(chunk); s->file->auxImpCopySym->setLocation(chunk); + } else if (ctx.hybridSymtab) { + // Fill the auxiliary IAT with null chunks for native-only imports. + auxIat.push_back(make<NullChunk>(ctx)); + auxIatCopy.push_back(make<NullChunk>(ctx)); } } // Terminate with null values. - lookups.push_back(make<NullChunk>(ctx)); - addresses.push_back(make<NullChunk>(ctx)); - if (ctx.config.machine == ARM64EC) { + lookups.push_back(lookupsTerminator ? lookupsTerminator + : make<NullChunk>(ctx)); + addresses.push_back(addressesTerminator ?
addressesTerminator : make<NullChunk>(ctx)); if (ctx.symtabEC) { auxIat.push_back(make<NullChunk>(ctx)); auxIatCopy.push_back(make<NullChunk>(ctx)); } - for (int i = 0, e = syms.size(); i < e; ++i) + for (int i = 0, e = syms.size(); i < e; ++i) { syms[i]->setLocation(addresses[base + i]); + if (syms[i]->file->hybridFile) + syms[i]->file->hybridFile->impSym->setLocation(addresses[base + i]); + } // Create the import table header. dllNames.push_back(make<StringChunk>(syms[0]->getDLLName())); auto *dir = make<ImportDirectoryChunk>(dllNames.back()); dir->lookupTab = lookups[base]; dir->addressTab = addresses[base]; dirs.push_back(dir); + + if (ctx.hybridSymtab) { + // If native-only imports exist, they will appear as a prefix to all + // imports. Emit ARM64X relocations to skip them in the EC view. + uint32_t nativeOnly = + llvm::find_if(syms, + [](DefinedImportData *s) { return s->file->isEC(); }) - + syms.begin(); + if (nativeOnly) { + ctx.dynamicRelocs->add( + IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA, 0, + Arm64XRelocVal( + dir, offsetof(ImportDirectoryTableEntry, ImportLookupTableRVA)), + nativeOnly * sizeof(uint64_t)); + ctx.dynamicRelocs->add( + IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA, 0, + Arm64XRelocVal(dir, offsetof(ImportDirectoryTableEntry, + ImportAddressTableRVA)), + nativeOnly * sizeof(uint64_t)); + } + } } // Add null terminator. dirs.push_back(make<NullChunk>(sizeof(ImportDirectoryTableEntry), 4)); diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 47faf70e099e1..7b105fb4c17a2 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1129,15 +1129,21 @@ void ObjFile::enqueuePdbFile(StringRef path, ObjFile *fromFile) { } ImportFile::ImportFile(COFFLinkerContext &ctx, MemoryBufferRef m) - : InputFile(ctx.symtab, ImportKind, m), live(!ctx.config.doGC) {} + : InputFile(ctx.getSymtab(getMachineType(m)), ImportKind, m), + live(!ctx.config.doGC) {} -MachineTypes ImportFile::getMachineType() const { +MachineTypes ImportFile::getMachineType(MemoryBufferRef m) { uint16_t machine = - reinterpret_cast<const coff_import_header *>(mb.getBufferStart()) ->Machine; + reinterpret_cast<const coff_import_header *>(m.getBufferStart())->Machine; return MachineTypes(machine); } +bool ImportFile::isSameImport(const ImportFile *other) const { + if (!externalName.empty()) + return other->externalName == externalName; + return hdr->OrdinalHint == other->hdr->OrdinalHint; +} + ImportThunkChunk *ImportFile::makeImportThunk() { switch (hdr->Machine) { case AMD64: diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index 823561cda247a..21b9aeef21d4f 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -351,11 +351,15 @@ class ImportFile : public InputFile { explicit ImportFile(COFFLinkerContext &ctx, MemoryBufferRef m); static bool classof(const InputFile *f) { return f->kind() == ImportKind; } - MachineTypes getMachineType() const override; + MachineTypes getMachineType() const override { return getMachineType(mb); } + static MachineTypes getMachineType(MemoryBufferRef m); + bool isSameImport(const ImportFile *other) const; + bool isEC() const { return impECSym != nullptr; } DefinedImportData *impSym = nullptr; Defined *thunkSym = nullptr; ImportThunkChunkARM64EC *impchkThunk = nullptr; + ImportFile *hybridFile = nullptr; std::string dllName; private: diff --git a/lld/test/COFF/arm64x-import.test b/lld/test/COFF/arm64x-import.test new file mode 100644 index 0000000000000..bc202e1d17251 --- /dev/null +++ b/lld/test/COFF/arm64x-import.test @@ -0,0 +1,533 @@ +REQUIRES: aarch64 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj
-triple=arm64ec-windows func12-thunks-arm64ec.s -o func12-thunks-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func12-thunks-arm64.s -o func12-thunks-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func12-arm64ec.s -o func12-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func123-arm64.s -o func123-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func123-arm64ec.s -o func123-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func12-arm64.s -o func12-arm64.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func234-arm64.s -o func234-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func12o-arm64ec.s -o func12o-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func34-arm64.s -o func34-arm64.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func34o-arm64.s -o func34o-arm64.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows funco-arm64.s -o funco-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows icall.s -o icall.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj +RUN: llvm-lib -machine:arm64ec -def:imp.def -out:imp-arm64ec.lib +RUN: llvm-lib -machine:arm64 -def:imp.def -out:imp-arm64.lib +RUN: llvm-lib -machine:arm64x -def:imp.def -defArm64Native:imp.def -out:imp-arm64x.lib +RUN: llvm-lib -machine:arm64x -def:imp-ord10.def -defArm64Native:imp.def -out:imp-ecord.lib +RUN: llvm-lib -machine:arm64x -def:imp-ord10.def -defArm64Native:imp-ord20.def -out:imp-ecord.lib +RUN: llvm-lib -machine:arm64x -def:imp2.def -defArm64Native:imp2.def -out:imp2.lib +RUN: llvm-lib -machine:arm64x -def:noname-ec.def -defArm64Native:noname-native.def -out:noname.lib +RUN: llvm-lib -machine:arm64x -def:dup-ec.def -defArm64Native:dup-native.def -out:dup.lib + + +# Link to the imported func1, func2, and func1's thunks from both native and EC code. + +RUN: lld-link -machine:arm64x -dll -noentry -out:test-12-thunks.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12-thunks-arm64ec.obj func12-thunks-arm64.obj imp-arm64ec.lib imp-arm64.lib + +RUN: llvm-objdump -d test-12-thunks.dll | FileCheck --check-prefix=DISASM-12T %s +DISASM-12T: 0000000180001000 <.text>: +DISASM-12T-NEXT: 180001000: f0000010 adrp x16, 0x180004000 +DISASM-12T-NEXT: 180001004: f9400610 ldr x16, [x16, #0x8] +DISASM-12T-NEXT: 180001008: d61f0200 br x16 +DISASM-12T-NEXT: ... 
+DISASM-12T-NEXT: 180002000: 52800040 mov w0, #0x2 // =2 +DISASM-12T-NEXT: 180002004: d65f03c0 ret +DISASM-12T-NEXT: 180002008: 90000030 adrp x16, 0x180006000 +DISASM-12T-NEXT: 18000200c: f9400210 ldr x16, [x16] +DISASM-12T-NEXT: 180002010: d61f0200 br x16 +DISASM-12T-NEXT: 180002014: d000000b adrp x11, 0x180004000 +DISASM-12T-NEXT: 180002018: f940016b ldr x11, [x11] +DISASM-12T-NEXT: 18000201c: 9000000a adrp x10, 0x180002000 <.text+0x1000> +DISASM-12T-NEXT: 180002020: 9100f14a add x10, x10, #0x3c +DISASM-12T-NEXT: 180002024: 17fffff7 b 0x180002000 <.text+0x1000> +DISASM-12T-NEXT: 180002028: d000000b adrp x11, 0x180004000 +DISASM-12T-NEXT: 18000202c: f940056b ldr x11, [x11, #0x8] +DISASM-12T-NEXT: 180002030: d0ffffea adrp x10, 0x180000000 +DISASM-12T-NEXT: 180002034: 9100014a add x10, x10, #0x0 +DISASM-12T-NEXT: 180002038: 17fffff2 b 0x180002000 <.text+0x1000> +DISASM-12T-NEXT: 18000203c: 52800060 mov w0, #0x3 // =3 +DISASM-12T-NEXT: 180002040: d65f03c0 ret +DISASM-12T-NEXT: ... +DISASM-12T-NEXT: 180003000: ff 25 fa 0f 00 00 jmpq *0xffa(%rip) # 0x180004000 + +RUN: llvm-readobj --coff-imports test-12-thunks.dll | FileCheck --check-prefix=IMPORTS-12 %s +IMPORTS-12: Import { +IMPORTS-12-NEXT: Name: test.dll +IMPORTS-12-NEXT: ImportLookupTableRVA: 0x5348 +IMPORTS-12-NEXT: ImportAddressTableRVA: 0x4000 +IMPORTS-12-NEXT: Symbol: func1 (0) +IMPORTS-12-NEXT: Symbol: func2 (0) +IMPORTS-12-NEXT: } +IMPORTS-12-NEXT: HybridObject { +IMPORTS-12: Import { +IMPORTS-12-NEXT: Name: test.dll +IMPORTS-12-NEXT: ImportLookupTableRVA: 0x5348 +IMPORTS-12-NEXT: ImportAddressTableRVA: 0x4000 +IMPORTS-12-NEXT: Symbol: func1 (0) +IMPORTS-12-NEXT: Symbol: func2 (0) +IMPORTS-12-NEXT: } +IMPORTS-12-NEXT: } + +RUN: llvm-readobj --hex-dump=.test test-12-thunks.dll | FileCheck --check-prefix=FUNC-12-THUNKS %s +FUNC-12-THUNKS: 0x180009000 00600000 00400000 00300000 08200000 +FUNC-12-THUNKS-NEXT: 0x180009010 08600000 08400000 + +RUN: llvm-readobj --hex-dump=.testa test-12-thunks.dll | FileCheck --check-prefix=FUNC-12-THUNKSA %s +FUNC-12-THUNKSA: 0x18000a000 00400000 08400000 00100000 + + +# If the ordinals of named imports don't match, use the EC value. + +RUN: lld-link -machine:arm64x -dll -noentry -out:test-12-thunks-ord.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12-thunks-arm64ec.obj func12-thunks-arm64.obj imp-ecord.lib +RUN: llvm-readobj --coff-imports test-12-thunks-ord.dll | FileCheck --check-prefix=IMPORTS-ORD %s + +IMPORTS-ORD: Format: COFF-ARM64X +IMPORTS-ORD-NEXT: Arch: aarch64 +IMPORTS-ORD-NEXT: AddressSize: 64bit +IMPORTS-ORD-NEXT: Import { +IMPORTS-ORD-NEXT: Name: test.dll +IMPORTS-ORD-NEXT: ImportLookupTableRVA: 0x5348 +IMPORTS-ORD-NEXT: ImportAddressTableRVA: 0x4000 +IMPORTS-ORD-NEXT: Symbol: func1 (11) +IMPORTS-ORD-NEXT: Symbol: func2 (12) +IMPORTS-ORD-NEXT: } +IMPORTS-ORD-NEXT: HybridObject { +IMPORTS-ORD-NEXT: Format: COFF-ARM64EC +IMPORTS-ORD-NEXT: Arch: aarch64 +IMPORTS-ORD-NEXT: AddressSize: 64bit +IMPORTS-ORD-NEXT: Import { +IMPORTS-ORD-NEXT: Name: test.dll +IMPORTS-ORD-NEXT: ImportLookupTableRVA: 0x5348 +IMPORTS-ORD-NEXT: ImportAddressTableRVA: 0x4000 +IMPORTS-ORD-NEXT: Symbol: func1 (11) +IMPORTS-ORD-NEXT: Symbol: func2 (12) +IMPORTS-ORD-NEXT: } +IMPORTS-ORD-NEXT: } + + +# Link to NONAME imports. 
+ +RUN: lld-link -machine:arm64x -dll -noentry -out:test-noname.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12-thunks-arm64ec.obj func12-thunks-arm64.obj noname.lib +RUN: llvm-readobj --coff-imports test-noname.dll | FileCheck --check-prefix=IMPORTS-ORD2 %s + +IMPORTS-ORD2: Format: COFF-ARM64X +IMPORTS-ORD2-NEXT: Arch: aarch64 +IMPORTS-ORD2-NEXT: AddressSize: 64bit +IMPORTS-ORD2-NEXT: Import { +IMPORTS-ORD2-NEXT: Name: test.dll +IMPORTS-ORD2-NEXT: ImportLookupTableRVA: 0x5348 +IMPORTS-ORD2-NEXT: ImportAddressTableRVA: 0x4000 +IMPORTS-ORD2-NEXT: Symbol: (12) +IMPORTS-ORD2-NEXT: Symbol: (11) +IMPORTS-ORD2-NEXT: } +IMPORTS-ORD2-NEXT: HybridObject { +IMPORTS-ORD2-NEXT: Format: COFF-ARM64EC +IMPORTS-ORD2-NEXT: Arch: aarch64 +IMPORTS-ORD2-NEXT: AddressSize: 64bit +IMPORTS-ORD2-NEXT: Import { +IMPORTS-ORD2-NEXT: Name: test.dll +IMPORTS-ORD2-NEXT: ImportLookupTableRVA: 0x5350 +IMPORTS-ORD2-NEXT: ImportAddressTableRVA: 0x4008 +IMPORTS-ORD2-NEXT: Symbol: (11) +IMPORTS-ORD2-NEXT: Symbol: (10) +IMPORTS-ORD2-NEXT: } +IMPORTS-ORD2-NEXT: } + +# Link to the imported func1 and func2 from both native and EC code, and func3 from native code. + +RUN: lld-link -machine:arm64x -dll -noentry -out:test2.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12-arm64ec.obj func123-arm64.obj imp-arm64x.lib + +RUN: llvm-readobj --coff-imports test2.dll | FileCheck --check-prefix=IMPORTS-123-12 %s +IMPORTS-123-12: Import { +IMPORTS-123-12-NEXT: Name: test.dll +IMPORTS-123-12-NEXT: ImportLookupTableRVA: 0x3338 +IMPORTS-123-12-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-123-12-NEXT: Symbol: func3 (0) +IMPORTS-123-12-NEXT: Symbol: func1 (0) +IMPORTS-123-12-NEXT: Symbol: func2 (0) +IMPORTS-123-12-NEXT: } +IMPORTS-123-12-NEXT: HybridObject { +IMPORTS-123-12: Import { +IMPORTS-123-12-NEXT: Name: test.dll +IMPORTS-123-12-NEXT: ImportLookupTableRVA: 0x3340 +IMPORTS-123-12-NEXT: ImportAddressTableRVA: 0x2008 +IMPORTS-123-12-NEXT: Symbol: func1 (0) +IMPORTS-123-12-NEXT: Symbol: func2 (0) +IMPORTS-123-12-NEXT: } +IMPORTS-123-12-NEXT: } + +RUN: llvm-readobj --hex-dump=.test test2.dll | FileCheck --check-prefix=TEST-123-12 %s +TEST-123-12: 0x180007000 08400000 08200000 10400000 10200000 + +RUN: llvm-readobj --hex-dump=.testa test2.dll | FileCheck --check-prefix=TEST-123-12A %s +TEST-123-12A: 0x180008000 08200000 10200000 00200000 + +RUN: llvm-readobj --hex-dump=.rdata test2.dll | FileCheck --check-prefix=TEST-123-12AUX %s +TEST-123-12AUX: 0x180004000 00000000 00000000 08100080 01000000 +TEST-123-12AUX-NEXT: 0x180004010 1c100080 01000000 00000000 00000000 + + +# Link to the imported func1 and func2 from both native and EC code, and func3 from EC code. 
+ +RUN: lld-link -machine:arm64x -dll -noentry -out:func-12-123.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func123-arm64ec.obj func12-arm64.obj imp-arm64x.lib + +RUN: llvm-readobj --coff-imports func-12-123.dll | FileCheck --check-prefix=IMPORTS-12-123 %s +IMPORTS-12-123: Import { +IMPORTS-12-123-NEXT: Name: test.dll +IMPORTS-12-123-NEXT: ImportLookupTableRVA: 0x3338 +IMPORTS-12-123-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-12-123-NEXT: Symbol: func1 (0) +IMPORTS-12-123-NEXT: Symbol: func2 (0) +IMPORTS-12-123-NEXT: } +IMPORTS-12-123-NEXT: HybridObject { +IMPORTS-12-123: Import { +IMPORTS-12-123-NEXT: Name: test.dll +IMPORTS-12-123-NEXT: ImportLookupTableRVA: 0x3338 +IMPORTS-12-123-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-12-123-NEXT: Symbol: func1 (0) +IMPORTS-12-123-NEXT: Symbol: func2 (0) +IMPORTS-12-123-NEXT: Symbol: func3 (0) +IMPORTS-12-123-NEXT: } +IMPORTS-12-123-NEXT: } + +RUN: llvm-readobj --hex-dump=.test func-12-123.dll | FileCheck --check-prefix=TEST-12-123 %s +TEST-12-123: 0x180007000 00400000 00200000 08400000 08200000 +TEST-12-123-NEXT: 0x180007010 10400000 10200000 + +RUN: llvm-readobj --hex-dump=.testa func-12-123.dll | FileCheck --check-prefix=TEST-12-123A %s +TEST-12-123A: 0x180008000 00200000 08200000 + +RUN: llvm-readobj --hex-dump=.rdata func-12-123.dll | FileCheck --check-prefix=TEST-12-123AUX %s +TEST-12-123AUX: 0x180004000 08100080 01000000 1c100080 01000000 +TEST-12-123AUX-NEXT: 0x180004010 30100080 01000000 00000000 00000000 + + +# Link to the imported func2 and func3 from both native and EC code, func4 from native code, +# and func1 from EC code. + +RUN: lld-link -machine:arm64x -dll -noentry -out:test-234-123.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func123-arm64ec.obj func234-arm64.obj imp-arm64x.lib + +RUN: llvm-readobj --coff-imports test-234-123.dll | FileCheck --check-prefix=IMPORTS-234-123 %s +IMPORTS-234-123: Import { +IMPORTS-234-123-NEXT: Name: test.dll +IMPORTS-234-123-NEXT: ImportLookupTableRVA: 0x3338 +IMPORTS-234-123-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-234-123-NEXT: Symbol: func4 (0) +IMPORTS-234-123-NEXT: Symbol: func2 (0) +IMPORTS-234-123-NEXT: Symbol: func3 (0) +IMPORTS-234-123-NEXT: } +IMPORTS-234-123-NEXT: HybridObject { +IMPORTS-234-123: Import { +IMPORTS-234-123-NEXT: Name: test.dll +IMPORTS-234-123-NEXT: ImportLookupTableRVA: 0x3340 +IMPORTS-234-123-NEXT: ImportAddressTableRVA: 0x2008 +IMPORTS-234-123-NEXT: Symbol: func2 (0) +IMPORTS-234-123-NEXT: Symbol: func3 (0) +IMPORTS-234-123-NEXT: Symbol: func1 (0) +IMPORTS-234-123-NEXT: } +IMPORTS-234-123-NEXT: } + +RUN: llvm-readobj --hex-dump=.test test-234-123.dll | FileCheck --check-prefix=TEST-234-123 %s +TEST-234-123: 0x180007000 18400000 18200000 08400000 08200000 +TEST-234-123-NEXT: 0x180007010 10400000 1020000 + +RUN: llvm-readobj --hex-dump=.testa test-234-123.dll | FileCheck --check-prefix=TEST-234-123A %s +TEST-234-123A: 0x180008000 08200000 10200000 00200000 + + +# Link to the imported func3 and func4 from native code, and func1 and func2 from EC code. 
+ +RUN: lld-link -machine:arm64x -dll -noentry -out:test-34-12.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12o-arm64ec.obj func34o-arm64.obj imp-arm64x.lib imp2.lib + +RUN: llvm-readobj --coff-imports test-34-12.dll | FileCheck --check-prefix=IMPORTS-34-12 %s +IMPORTS-34-12: Import { +IMPORTS-34-12-NEXT: Name: test.dll +IMPORTS-34-12-NEXT: ImportLookupTableRVA: 0x3350 +IMPORTS-34-12-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-34-12-NEXT: Symbol: func3 (0) +IMPORTS-34-12-NEXT: Symbol: func4 (0) +IMPORTS-34-12-NEXT: } +IMPORTS-34-12-NEXT: Import { +IMPORTS-34-12-NEXT: Name: test2.dll +IMPORTS-34-12-NEXT: ImportLookupTableRVA: 0x3378 +IMPORTS-34-12-NEXT: ImportAddressTableRVA: 0x2028 +IMPORTS-34-12-NEXT: Symbol: otherfunc (0) +IMPORTS-34-12-NEXT: } +IMPORTS-34-12-NEXT: HybridObject { +IMPORTS-34-12: Import { +IMPORTS-34-12-NEXT: Name: test.dll +IMPORTS-34-12-NEXT: ImportLookupTableRVA: 0x3360 +IMPORTS-34-12-NEXT: ImportAddressTableRVA: 0x2010 +IMPORTS-34-12-NEXT: Symbol: func1 (0) +IMPORTS-34-12-NEXT: Symbol: func2 (0) +IMPORTS-34-12-NEXT: } +IMPORTS-34-12-NEXT: Import { +IMPORTS-34-12-NEXT: Name: test2.dll +IMPORTS-34-12-NEXT: ImportLookupTableRVA: 0x3378 +IMPORTS-34-12-NEXT: ImportAddressTableRVA: 0x2028 +IMPORTS-34-12-NEXT: Symbol: otherfunc (0) +IMPORTS-34-12-NEXT: } +IMPORTS-34-12-NEXT: } + +RUN: llvm-readobj --hex-dump=.test test-34-12.dll | FileCheck --check-prefix=TEST-23-12 %s +TEST-23-12: 0x180007000 10400000 10200000 18400000 18200000 +TEST-23-12-NEXT: 0x180007010 28400000 28200000 + +RUN: llvm-readobj --hex-dump=.testa test-34-12.dll | FileCheck --check-prefix=TEST-23-12A %s +TEST-23-12A: 0x180008000 00200000 08200000 28200000 + + +# Link only to imported EC functions, with no native imports. + +RUN: lld-link -machine:arm64x -dll -noentry -out:test-ec12.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12-arm64ec.obj funco-arm64.obj imp-arm64x.lib imp2.lib + +RUN: llvm-readobj --coff-imports test-ec12.dll | FileCheck --check-prefix=IMPORTS-EC12 %s + +IMPORTS-EC12: File: test-ec12.dll +IMPORTS-EC12-NEXT: Format: COFF-ARM64X +IMPORTS-EC12-NEXT: Arch: aarch64 +IMPORTS-EC12-NEXT: AddressSize: 64bit +IMPORTS-EC12-NEXT: Import { +IMPORTS-EC12-NEXT: Name: test.dll +IMPORTS-EC12-NEXT: ImportLookupTableRVA: 0x3350 +IMPORTS-EC12-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-EC12-NEXT: } +IMPORTS-EC12-NEXT: Import { +IMPORTS-EC12-NEXT: Name: test2.dll +IMPORTS-EC12-NEXT: ImportLookupTableRVA: 0x3368 +IMPORTS-EC12-NEXT: ImportAddressTableRVA: 0x2018 +IMPORTS-EC12-NEXT: Symbol: otherfunc (0) +IMPORTS-EC12-NEXT: } +IMPORTS-EC12-NEXT: HybridObject { +IMPORTS-EC12-NEXT: Format: COFF-ARM64EC +IMPORTS-EC12-NEXT: Arch: aarch64 +IMPORTS-EC12-NEXT: AddressSize: 64bit +IMPORTS-EC12-NEXT: Import { +IMPORTS-EC12-NEXT: Name: test.dll +IMPORTS-EC12-NEXT: ImportLookupTableRVA: 0x3350 +IMPORTS-EC12-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-EC12-NEXT: Symbol: func1 (0) +IMPORTS-EC12-NEXT: Symbol: func2 (0) +IMPORTS-EC12-NEXT: } +IMPORTS-EC12-NEXT: Import { +IMPORTS-EC12-NEXT: Name: test2.dll +IMPORTS-EC12-NEXT: ImportLookupTableRVA: 0x3370 +IMPORTS-EC12-NEXT: ImportAddressTableRVA: 0x2020 +IMPORTS-EC12-NEXT: } +IMPORTS-EC12-NEXT: } + + +# Link only to imported native functions, with no EC imports. 
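+# (Added note: the IMPORTS-N12 checks below show that the EC view still gets
+# an import directory entry for test.dll, but with no symbols listed.)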
+ +RUN: lld-link -machine:arm64x -dll -noentry -out:test-n12.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: func12-arm64.obj imp-arm64x.lib + +RUN: llvm-readobj --coff-imports test-n12.dll | FileCheck --check-prefix=IMPORTS-N12 %s + +IMPORTS-N12: Arch: aarch64 +IMPORTS-N12-NEXT: AddressSize: 64bit +IMPORTS-N12-NEXT: Import { +IMPORTS-N12-NEXT: Name: test.dll +IMPORTS-N12-NEXT: ImportLookupTableRVA: 0x2330 +IMPORTS-N12-NEXT: ImportAddressTableRVA: 0x1000 +IMPORTS-N12-NEXT: Symbol: func1 (0) +IMPORTS-N12-NEXT: Symbol: func2 (0) +IMPORTS-N12-NEXT: } +IMPORTS-N12-NEXT: HybridObject { +IMPORTS-N12-NEXT: Format: COFF-ARM64EC +IMPORTS-N12-NEXT: Arch: aarch64 +IMPORTS-N12-NEXT: AddressSize: 64bit +IMPORTS-N12-NEXT: Import { +IMPORTS-N12-NEXT: Name: test.dll +IMPORTS-N12-NEXT: ImportLookupTableRVA: 0x2340 +IMPORTS-N12-NEXT: ImportAddressTableRVA: 0x1010 +IMPORTS-N12-NEXT: } +IMPORTS-N12-NEXT: } + + +RUN: lld-link -machine:arm64x -dll -noentry -out:test-dup.dll loadconfig-arm64.obj loadconfig-arm64ec.obj icall.obj \ +RUN: func12-arm64ec.obj func34-arm64.obj dup.lib + +RUN: llvm-readobj --coff-imports test-dup.dll | FileCheck --check-prefix=IMPORTS-DUP %s +IMPORTS-DUP: Format: COFF-ARM64X +IMPORTS-DUP-NEXT: Arch: aarch64 +IMPORTS-DUP-NEXT: AddressSize: 64bit +IMPORTS-DUP-NEXT: Import { +IMPORTS-DUP-NEXT: Name: test.dll +IMPORTS-DUP-NEXT: ImportLookupTableRVA: 0x3338 +IMPORTS-DUP-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-DUP-NEXT: Symbol: func4 (0) +IMPORTS-DUP-NEXT: Symbol: func4 (0) +IMPORTS-DUP-NEXT: } +IMPORTS-DUP-NEXT: HybridObject { +IMPORTS-DUP-NEXT: Format: COFF-ARM64EC +IMPORTS-DUP-NEXT: Arch: aarch64 +IMPORTS-DUP-NEXT: AddressSize: 64bit +IMPORTS-DUP-NEXT: Import { +IMPORTS-DUP-NEXT: Name: test.dll +IMPORTS-DUP-NEXT: ImportLookupTableRVA: 0x3348 +IMPORTS-DUP-NEXT: ImportAddressTableRVA: 0x2010 +IMPORTS-DUP-NEXT: Symbol: func1 (0) +IMPORTS-DUP-NEXT: Symbol: func1 (0) +IMPORTS-DUP-NEXT: } +IMPORTS-DUP-NEXT: } + +#--- func12-thunks-arm64ec.s + .section .test, "r" + .rva __imp_func1 + .rva __imp_aux_func1 + .rva func1 + .rva "#func1" + .rva __imp_func2 + .rva __imp_aux_func2 + +#--- func12-thunks-arm64.s + .section .testa, "r" + .rva __imp_func1 + .rva __imp_func2 + .rva func2 + +#--- func12-arm64ec.s + .section .test, "r" + .rva __imp_func1 + .rva __imp_aux_func1 + .rva __imp_func2 + .rva __imp_aux_func2 + +#--- func123-arm64.s + .section .testa, "r" + .rva __imp_func1 + .rva __imp_func2 + .rva __imp_func3 + +#--- func123-arm64ec.s + .section .test, "r" + .rva __imp_func1 + .rva __imp_aux_func1 + .rva __imp_func2 + .rva __imp_aux_func2 + .rva __imp_func3 + .rva __imp_aux_func3 + +#--- func12-arm64.s + .section .testa, "r" + .rva __imp_func1 + .rva __imp_func2 + +#--- func234-arm64.s + .section .testa, "r" + .rva __imp_func2 + .rva __imp_func3 + .rva __imp_func4 + +#--- func12o-arm64ec.s + .section .test, "r" + .rva __imp_func1 + .rva __imp_aux_func1 + .rva __imp_func2 + .rva __imp_aux_func2 + .rva __imp_otherfunc + .rva __imp_aux_otherfunc + +#--- func34-arm64.s + .section .testa, "r" + .rva __imp_func3 + .rva __imp_func4 + +#--- func34o-arm64.s + .section .testa, "r" + .rva __imp_func3 + .rva __imp_func4 + .rva __imp_otherfunc + +#--- funco-arm64.s + .section .testa, "r" + .rva __imp_otherfunc + +#--- icall.s + .text + .globl __icall_helper_arm64ec + .p2align 2, 0x0 +__icall_helper_arm64ec: + mov w0, #2 + ret + + .section .hybmp$x, "yi" + .symidx __imp_func1 + .symidx func1_exit_thunk + .word 4 + + .section .wowthk$aa,"xr",discard,func1_exit_thunk + .globl func1_exit_thunk 
+func1_exit_thunk: + mov w0, #3 + ret + +#--- imp.def +NAME test.dll +EXPORTS + data_sym DATA + func1 + func2 + func3 + func4 + +#--- imp-ord10.def +NAME test.dll +EXPORTS + data_sym DATA @10 + func1 @11 + func2 @12 + func3 @13 + func4 @14 + +#--- imp-ord20.def +NAME test.dll +EXPORTS + data_sym DATA @10 + func1 @21 + func2 @22 + func3 @23 + func4 @24 + +#--- imp2.def +NAME test2.dll +EXPORTS + otherfunc + +#--- noname-ec.def +NAME test.dll +EXPORTS + func1 @10 NONAME + func2 @11 NONAME + +#--- noname-native.def +NAME test.dll +EXPORTS + func1 @12 NONAME + func2 @11 NONAME + +#--- dup-ec.def +NAME test.dll +EXPORTS + func1 + func2 EXPORTAS func1 + +#--- dup-native.def +NAME test.dll +EXPORTS + func3 EXPORTAS func4 + func4 From 80ab237c1187aa7e8a1f546175887d768fa14e2d Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Sun, 26 Jan 2025 22:36:01 +0100 Subject: [PATCH 147/432] [LLD][COFF] Add REQUIRES: x86 to arm64x-import.test (NFC) This ensures the disassembler can handle ARM64X binaries correctly. Fixes #124189. --- lld/test/COFF/arm64x-import.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/test/COFF/arm64x-import.test b/lld/test/COFF/arm64x-import.test index bc202e1d17251..7441c71d87710 100644 --- a/lld/test/COFF/arm64x-import.test +++ b/lld/test/COFF/arm64x-import.test @@ -1,4 +1,4 @@ -REQUIRES: aarch64 +REQUIRES: aarch64, x86 RUN: split-file %s %t.dir && cd %t.dir RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func12-thunks-arm64ec.s -o func12-thunks-arm64ec.obj From e278e1b6ece025ace4238748c0f57fda3ca833f9 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Sun, 26 Jan 2025 23:58:58 +0200 Subject: [PATCH 148/432] [NFC][CodeGen] Fix typos in code comments. (#124382) This fixes typos in the `calcUniqueIDUpdateFlagsAndSize` function. --- llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index fbbd92a2e0ca4..6ab6d18213ba4 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -732,7 +732,7 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName, // that section can be assigned an incorrect entry size. To avoid this we // usually put symbols of the same size into distinct mergeable sections with // the same name. Doing so relies on the ",unique ," assembly feature. This - // feature is not avalible until bintuils version 2.35 + // feature is not available until binutils version 2.35 // (https://sourceware.org/bugzilla/show_bug.cgi?id=25380).
const bool SupportsUnique = Ctx.getAsmInfo()->useIntegratedAssembler() || Ctx.getAsmInfo()->binutilsIsAtLeast(2, 35); @@ -745,7 +745,7 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName, const bool SymbolMergeable = Flags & ELF::SHF_MERGE; const bool SeenSectionNameBefore = Ctx.isELFGenericMergeableSection(SectionName); - // If this is the first ocurrence of this section name, treat it as the + // If this is the first occurrence of this section name, treat it as the // generic section if (!SymbolMergeable && !SeenSectionNameBefore) { if (TM.getSeparateNamedSections()) From c9637afec7ed72904c74c2fc71e990d378f3d7a6 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Sun, 26 Jan 2025 15:06:26 -0800 Subject: [PATCH 149/432] [Clang] Fix createConstexprUnknownAPValues to use zero offset when creating APValue (#124478) When implementing P2280R4 here: https://github.com/llvm/llvm-project/pull/95474 When creating the APValue to store a constexprUnknown value I used an offset of CharUnits::One() but it should have been CharUnits::Zero(). This change just adjusts that value. --- clang/lib/AST/ExprConstant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 3b5ab839c6cf7..be8f1fe02e721 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -1961,7 +1961,7 @@ APValue & CallStackFrame::createConstexprUnknownAPValues(const VarDecl *Key, APValue::LValueBase Base) { APValue &Result = ConstexprUnknownAPValues[MapKeyTy(Key, Base.getVersion())]; - Result = APValue(Base, CharUnits::One(), APValue::ConstexprUnknown{}); + Result = APValue(Base, CharUnits::Zero(), APValue::ConstexprUnknown{}); return Result; } From bfa7de0df5d8eb8dd284b0f49f10e7f0cd850693 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Mon, 27 Jan 2025 07:27:26 +0800 Subject: [PATCH 150/432] X86: Support FCANONICALIZE on f64/f80 for i686 with SSE2 or AVX (#123917) Currently, FCANONICALIZE is not enabled for f64 with SSE2, and is not enabled for f80 on 32-bit systems. Let's enable them.
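For illustration (added commentary, not part of the upstream commit message):
this path is reachable from C/C++ through Clang's __builtin_canonicalize
builtins, which lower to the llvm.canonicalize.* intrinsics and hence to
FCANONICALIZE nodes. A minimal sketch, assuming an i686-linux target built
with -msse2 or -mavx:

  double canon_f64(double x) { return __builtin_canonicalize(x); }
  long double canon_f80(long double x) { return __builtin_canonicalizel(x); }

With this change, both functions select the multiply-by-1.0 lowering
exercised by the canonicalize-vars.ll tests below, instead of hitting the
previous legalization gap.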
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +- llvm/test/CodeGen/X86/canonicalize-vars.ll | 336 ++++++++++++++++++++- 2 files changed, 335 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 23731212a420c..ce3c140af8105 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -334,10 +334,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); } setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); } } if (Subtarget.hasAVX10_2()) { @@ -367,7 +367,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); @@ -889,6 +888,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); + setOperationAction(ISD::FCANONICALIZE , MVT::f80, Custom); if (isTypeLegal(MVT::f16)) { setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); diff --git a/llvm/test/CodeGen/X86/canonicalize-vars.ll b/llvm/test/CodeGen/X86/canonicalize-vars.ll index 951ea1b72f439..67213b38277dc 100644 --- a/llvm/test/CodeGen/X86/canonicalize-vars.ll +++ b/llvm/test/CodeGen/X86/canonicalize-vars.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5 ; RUN: llc -mtriple=i686-- < %s | FileCheck %s -check-prefixes=X87 -; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE,SSE2 +; RUN: llc -mattr=+sse2 -mtriple=i686-- < %s | FileCheck %s -check-prefixes=X86-SSE +; RUN: llc -mattr=+avx -mtriple=i686-- < %s | FileCheck %s -check-prefixes=X86-AVX +; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE ; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX1 ; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX2 ; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX512F @@ -12,6 +14,30 @@ define float @canon_fp32_varargsf32(float %a) { ; X87-NEXT: fmuls {{[0-9]+}}(%esp) ; X87-NEXT: retl ; +; X86-SSE-LABEL: canon_fp32_varargsf32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canon_fp32_varargsf32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: vmovss 
{{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovss %xmm0, (%esp) +; X86-AVX-NEXT: flds (%esp) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canon_fp32_varargsf32: ; SSE: # %bb.0: ; SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -33,6 +59,20 @@ define x86_fp80 @canon_fp32_varargsf80(x86_fp80 %a) { ; X87-NEXT: fmulp %st, %st(1) ; X87-NEXT: retl ; +; X86-SSE-LABEL: canon_fp32_varargsf80: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: fldt {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fld1 +; X86-SSE-NEXT: fmulp %st, %st(1) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canon_fp32_varargsf80: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: fldt {{[0-9]+}}(%esp) +; X86-AVX-NEXT: fld1 +; X86-AVX-NEXT: fmulp %st, %st(1) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canon_fp32_varargsf80: ; SSE: # %bb.0: ; SSE-NEXT: fldt {{[0-9]+}}(%rsp) @@ -64,6 +104,32 @@ define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) { ; X87-NEXT: fsubp %st, %st(1) ; X87-NEXT: retl ; +; X86-SSE-LABEL: complex_canonicalize_fmul_x86_fp80: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: fldt {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fldt {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fsub %st(1), %st +; X86-SSE-NEXT: fld %st(0) +; X86-SSE-NEXT: fadd %st(2), %st +; X86-SSE-NEXT: fsubp %st, %st(1) +; X86-SSE-NEXT: fld1 +; X86-SSE-NEXT: fmulp %st, %st(1) +; X86-SSE-NEXT: fsubp %st, %st(1) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: complex_canonicalize_fmul_x86_fp80: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: fldt {{[0-9]+}}(%esp) +; X86-AVX-NEXT: fldt {{[0-9]+}}(%esp) +; X86-AVX-NEXT: fsub %st(1), %st +; X86-AVX-NEXT: fld %st(0) +; X86-AVX-NEXT: fadd %st(2), %st +; X86-AVX-NEXT: fsubp %st, %st(1) +; X86-AVX-NEXT: fld1 +; X86-AVX-NEXT: fmulp %st, %st(1) +; X86-AVX-NEXT: fsubp %st, %st(1) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: complex_canonicalize_fmul_x86_fp80: ; SSE: # %bb.0: # %entry ; SSE-NEXT: fldt {{[0-9]+}}(%rsp) @@ -130,6 +196,54 @@ define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 { ; X87-NEXT: fmulp %st, %st(1) ; X87-NEXT: retl ; +; X86-SSE-LABEL: canonicalize_fp64: +; X86-SSE: # %bb.0: # %start +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %ebp, -8 +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: .cfi_def_cfa_register %ebp +; X86-SSE-NEXT: andl $-8, %esp +; X86-SSE-NEXT: subl $8, %esp +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: movapd %xmm0, %xmm2 +; X86-SSE-NEXT: cmpunordsd %xmm0, %xmm2 +; X86-SSE-NEXT: movapd %xmm2, %xmm3 +; X86-SSE-NEXT: andpd %xmm1, %xmm3 +; X86-SSE-NEXT: maxsd %xmm0, %xmm1 +; X86-SSE-NEXT: andnpd %xmm1, %xmm2 +; X86-SSE-NEXT: orpd %xmm3, %xmm2 +; X86-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE-NEXT: movsd %xmm2, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: .cfi_def_cfa %esp, 4 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canonicalize_fp64: +; X86-AVX: # %bb.0: # %start +; X86-AVX-NEXT: pushl %ebp +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %ebp, -8 +; X86-AVX-NEXT: movl %esp, %ebp +; X86-AVX-NEXT: .cfi_def_cfa_register %ebp +; X86-AVX-NEXT: andl $-8, %esp +; X86-AVX-NEXT: subl $8, %esp +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; X86-AVX-NEXT: 
vcmpunordsd %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX-NEXT: fldl (%esp) +; X86-AVX-NEXT: movl %ebp, %esp +; X86-AVX-NEXT: popl %ebp +; X86-AVX-NEXT: .cfi_def_cfa %esp, 4 +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canonicalize_fp64: ; SSE: # %bb.0: # %start ; SSE-NEXT: movapd %xmm0, %xmm2 @@ -207,6 +321,42 @@ define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 { ; X87-NEXT: fmulp %st, %st(1) ; X87-NEXT: retl ; +; X86-SSE-LABEL: canonicalize_fp32: +; X86-SSE: # %bb.0: # %start +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movaps %xmm0, %xmm2 +; X86-SSE-NEXT: cmpunordss %xmm0, %xmm2 +; X86-SSE-NEXT: movaps %xmm2, %xmm3 +; X86-SSE-NEXT: andps %xmm1, %xmm3 +; X86-SSE-NEXT: maxss %xmm0, %xmm1 +; X86-SSE-NEXT: andnps %xmm1, %xmm2 +; X86-SSE-NEXT: orps %xmm3, %xmm2 +; X86-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE-NEXT: movss %xmm2, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canonicalize_fp32: +; X86-AVX: # %bb.0: # %start +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; X86-AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; X86-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovss %xmm0, (%esp) +; X86-AVX-NEXT: flds (%esp) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canonicalize_fp32: ; SSE: # %bb.0: # %start ; SSE-NEXT: movaps %xmm0, %xmm2 @@ -261,6 +411,22 @@ define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { ; X87-NEXT: fstps (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: v_test_canonicalize_var_f32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: v_test_canonicalize_var_f32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovss %xmm0, (%eax) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: v_test_canonicalize_var_f32: ; SSE: # %bb.0: ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -290,6 +456,24 @@ define void @v_test_canonicalize_x86_fp80(x86_fp80 addrspace(1)* %out) #1 { ; X87-NEXT: fstpt (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: v_test_canonicalize_x86_fp80: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: fldt (%eax) +; X86-SSE-NEXT: fld1 +; X86-SSE-NEXT: fmulp %st, %st(1) +; X86-SSE-NEXT: fstpt (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: v_test_canonicalize_x86_fp80: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: fldt (%eax) +; X86-AVX-NEXT: fld1 +; X86-AVX-NEXT: fmulp %st, %st(1) +; X86-AVX-NEXT: fstpt (%eax) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: v_test_canonicalize_x86_fp80: ; 
SSE: # %bb.0: ; SSE-NEXT: fldt (%rdi) @@ -320,6 +504,22 @@ define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { ; X87-NEXT: fstpl (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: v_test_canonicalize_var_f64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movsd %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: v_test_canonicalize_var_f64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovsd %xmm0, (%eax) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: v_test_canonicalize_var_f64: ; SSE: # %bb.0: ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero @@ -347,6 +547,20 @@ define void @canonicalize_undef(double addrspace(1)* %out) { ; X87-NEXT: movl $0, (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: canonicalize_undef: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl $2146959360, 4(%eax) # imm = 0x7FF80000 +; X86-SSE-NEXT: movl $0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canonicalize_undef: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl $2146959360, 4(%eax) # imm = 0x7FF80000 +; X86-AVX-NEXT: movl $0, (%eax) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canonicalize_undef: ; SSE: # %bb.0: ; SSE-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 @@ -384,6 +598,16 @@ define <4 x float> @canon_fp32_varargsv4f32(<4 x float> %a) { ; X87-NEXT: fstps (%eax) ; X87-NEXT: retl $4 ; +; X86-SSE-LABEL: canon_fp32_varargsv4f32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canon_fp32_varargsv4f32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canon_fp32_varargsv4f32: ; SSE: # %bb.0: ; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -430,6 +654,18 @@ define <4 x double> @canon_fp64_varargsv4f64(<4 x double> %a) { ; X87-NEXT: fstpl (%eax) ; X87-NEXT: retl $4 ; +; X86-SSE-LABEL: canon_fp64_varargsv4f64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movapd {{.*#+}} xmm2 = [1.0E+0,1.0E+0] +; X86-SSE-NEXT: mulpd %xmm2, %xmm0 +; X86-SSE-NEXT: mulpd %xmm2, %xmm1 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canon_fp64_varargsv4f64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canon_fp64_varargsv4f64: ; SSE: # %bb.0: ; SSE-NEXT: movapd {{.*#+}} xmm2 = [1.0E+0,1.0E+0] @@ -468,6 +704,26 @@ define <2 x x86_fp80> @canon_fp80_varargsv2fp80(<2 x x86_fp80> %a) { ; X87-NEXT: fxch %st(1) ; X87-NEXT: retl ; +; X86-SSE-LABEL: canon_fp80_varargsv2fp80: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: fldt {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fldt {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fld1 +; X86-SSE-NEXT: fmul %st, %st(1) +; X86-SSE-NEXT: fmulp %st, %st(2) +; X86-SSE-NEXT: fxch %st(1) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canon_fp80_varargsv2fp80: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: fldt {{[0-9]+}}(%esp) +; X86-AVX-NEXT: fldt {{[0-9]+}}(%esp) +; X86-AVX-NEXT: fld1 +; X86-AVX-NEXT: fmul %st, %st(1) +; X86-AVX-NEXT: fmulp %st, %st(2) +; X86-AVX-NEXT: fxch %st(1) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canon_fp80_varargsv2fp80: ; SSE: # %bb.0: ; SSE-NEXT: fldt {{[0-9]+}}(%rsp) @@ -512,6 +768,22 @@ define void 
@vec_canonicalize_var_v4f32(<4 x float> addrspace(1)* %out) #1 { ; X87-NEXT: fstps (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: vec_canonicalize_var_v4f32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movaps (%eax), %xmm0 +; X86-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: vec_canonicalize_var_v4f32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovaps (%eax), %xmm0 +; X86-AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: vec_canonicalize_var_v4f32: ; SSE: # %bb.0: ; SSE-NEXT: movaps (%rdi), %xmm0 @@ -566,6 +838,26 @@ define void @vec_canonicalize_var_v4f64(<4 x double> addrspace(1)* %out) #1 { ; X87-NEXT: fstpl (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: vec_canonicalize_var_v4f64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0] +; X86-SSE-NEXT: movapd 16(%eax), %xmm1 +; X86-SSE-NEXT: mulpd %xmm0, %xmm1 +; X86-SSE-NEXT: mulpd (%eax), %xmm0 +; X86-SSE-NEXT: movapd %xmm0, (%eax) +; X86-SSE-NEXT: movapd %xmm1, 16(%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: vec_canonicalize_var_v4f64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovapd (%eax), %ymm0 +; X86-AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX-NEXT: vmovapd %ymm0, (%eax) +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; ; SSE-LABEL: vec_canonicalize_var_v4f64: ; SSE: # %bb.0: ; SSE-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0] @@ -626,6 +918,46 @@ define void @vec_canonicalize_x86_fp80(<4 x x86_fp80> addrspace(1)* %out) #1 { ; X87-NEXT: fstpt (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: vec_canonicalize_x86_fp80: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: fldt 30(%eax) +; X86-SSE-NEXT: fldt 20(%eax) +; X86-SSE-NEXT: fldt 10(%eax) +; X86-SSE-NEXT: fldt (%eax) +; X86-SSE-NEXT: fld1 +; X86-SSE-NEXT: fmul %st, %st(1) +; X86-SSE-NEXT: fmul %st, %st(2) +; X86-SSE-NEXT: fmul %st, %st(3) +; X86-SSE-NEXT: fmulp %st, %st(4) +; X86-SSE-NEXT: fxch %st(3) +; X86-SSE-NEXT: fstpt 30(%eax) +; X86-SSE-NEXT: fxch %st(1) +; X86-SSE-NEXT: fstpt 20(%eax) +; X86-SSE-NEXT: fstpt 10(%eax) +; X86-SSE-NEXT: fstpt (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: vec_canonicalize_x86_fp80: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: fldt 30(%eax) +; X86-AVX-NEXT: fldt 20(%eax) +; X86-AVX-NEXT: fldt 10(%eax) +; X86-AVX-NEXT: fldt (%eax) +; X86-AVX-NEXT: fld1 +; X86-AVX-NEXT: fmul %st, %st(1) +; X86-AVX-NEXT: fmul %st, %st(2) +; X86-AVX-NEXT: fmul %st, %st(3) +; X86-AVX-NEXT: fmulp %st, %st(4) +; X86-AVX-NEXT: fxch %st(3) +; X86-AVX-NEXT: fstpt 30(%eax) +; X86-AVX-NEXT: fxch %st(1) +; X86-AVX-NEXT: fstpt 20(%eax) +; X86-AVX-NEXT: fstpt 10(%eax) +; X86-AVX-NEXT: fstpt (%eax) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: vec_canonicalize_x86_fp80: ; SSE: # %bb.0: ; SSE-NEXT: fldt 30(%rdi) @@ -668,5 +1000,3 @@ define void @vec_canonicalize_x86_fp80(<4 x x86_fp80> addrspace(1)* %out) #1 { store <4 x x86_fp80> %canonicalized, <4 x x86_fp80> addrspace(1)* %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; SSE2: {{.*}} From db79fb2a91df31a07f312f8e061936927ac5c506 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Sun, 26 Jan 2025 15:40:55 -0800 Subject: [PATCH 151/432] [msan] Add handlers for AVX masked load/store intrinsics (#123857) This patch adds explicit support for AVX masked load/store intrinsics, largely by applying the intrinsics to the shadows (but subtly different to handleIntrinsicByApplyingToShadow()). We do not reuse the handleMaskedLoad/Store functions. The key challenge is that the LLVM masked intrinsics require a vector of booleans, while AVX masked intrinsics use the MSBs of a vector of integers. X86InstCombineIntrinsic.cpp::simplifyX86MaskedLoad mentions that the x86 backend does not know how to efficiently convert from a vector of booleans back into the AVX mask format; therefore, they (and we) do not reduce AVX masked intrinsics into LLVM masked intrinsics. --- .../Instrumentation/MemorySanitizer.cpp | 154 ++++++++++++++++- .../MemorySanitizer/X86/avx-intrinsics-x86.ll | 160 ++++++++++-------- .../X86/avx2-intrinsics-x86.ll | 152 +++++++++-------- .../i386/avx-intrinsics-i386.ll | 160 ++++++++++-------- .../i386/avx2-intrinsics-i386.ll | 152 +++++++++-------- 5 files changed, 489 insertions(+), 289 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 56d3eb10d73e9..b6293af4ab477 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3046,7 +3046,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (maybeHandleSimpleNomemIntrinsic(I)) return true; - // FIXME: detect and handle SSE maskstore/maskload + // FIXME: detect and handle SSE maskstore/maskload? + // Some cases are now handled in handleAVXMasked{Load,Store}. return false; } @@ -3683,6 +3684,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // TODO: Store origin. } + // Intrinsic::masked_store + // + // Note: handleAVXMaskedStore handles AVX/AVX2 variants, though AVX512 masked + // stores are lowered to Intrinsic::masked_store. void handleMaskedStore(IntrinsicInst &I) { IRBuilder<> IRB(&I); Value *V = I.getArgOperand(0); @@ -3713,6 +3718,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { std::max(Alignment, kMinOriginAlignment)); } + // Intrinsic::masked_load + // + // Note: handleAVXMaskedLoad handles AVX/AVX2 variants, though AVX512 masked + // loads are lowered to Intrinsic::masked_load. void handleMaskedLoad(IntrinsicInst &I) { IRBuilder<> IRB(&I); Value *Ptr = I.getArgOperand(0); @@ -3754,6 +3763,125 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOrigin(&I, Origin); } + // e.g., void @llvm.x86.avx.maskstore.ps.256(ptr, <8 x i32>, <8 x float>) + // dst mask src + // + // AVX512 masked stores are lowered to Intrinsic::masked_store and are handled + // by handleMaskedStore. + // + // This function handles AVX and AVX2 masked stores; these use the MSBs of a + // vector of integers, unlike the LLVM masked intrinsics, which require a + // vector of booleans. X86InstCombineIntrinsic.cpp::simplifyX86MaskedLoad + // mentions that the x86 backend does not know how to efficiently convert + // from a vector of booleans back into the AVX mask format; therefore, they + // (and we) do not reduce AVX/AVX2 masked intrinsics into LLVM masked + // intrinsics.
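+  //
+  // Illustrative example (added commentary, not part of the original patch):
+  // given a <4 x i32> AVX mask with elements {-1, 1, 0x80000000, 0}, only
+  // lanes 0 and 2 are selected, because only those elements have their MSB
+  // set; the equivalent LLVM masked intrinsic would need the <4 x i1> mask
+  // <1, 0, 1, 0>, and it is this boolean form that the x86 backend cannot
+  // cheaply convert back into the AVX mask format.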
+ void handleAVXMaskedStore(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + + Value *Dst = I.getArgOperand(0); + assert(Dst->getType()->isPointerTy() && "Destination is not a pointer!"); + + Value *Mask = I.getArgOperand(1); + assert(isa<VectorType>(Mask->getType()) && "Mask is not a vector!"); + + Value *Src = I.getArgOperand(2); + assert(isa<VectorType>(Src->getType()) && "Source is not a vector!"); + + const Align Alignment = Align(1); + + Value *SrcShadow = getShadow(Src); + + if (ClCheckAccessAddress) { + insertShadowCheck(Dst, &I); + insertShadowCheck(Mask, &I); + } + + Value *DstShadowPtr; + Value *DstOriginPtr; + std::tie(DstShadowPtr, DstOriginPtr) = getShadowOriginPtr( + Dst, IRB, SrcShadow->getType(), Alignment, /*isStore*/ true); + + SmallVector<Value *> ShadowArgs; + ShadowArgs.append(1, DstShadowPtr); + ShadowArgs.append(1, Mask); + // The intrinsic may require floating-point but shadows can be arbitrary + // bit patterns, of which some would be interpreted as "invalid" + // floating-point values (NaN etc.); we assume the intrinsic will happily + // copy them. + ShadowArgs.append(1, IRB.CreateBitCast(SrcShadow, Src->getType())); + + CallInst *CI = + IRB.CreateIntrinsic(IRB.getVoidTy(), I.getIntrinsicID(), ShadowArgs); + setShadow(&I, CI); + + if (!MS.TrackOrigins) + return; + + // Approximation only + auto &DL = F.getDataLayout(); + paintOrigin(IRB, getOrigin(Src), DstOriginPtr, + DL.getTypeStoreSize(SrcShadow->getType()), + std::max(Alignment, kMinOriginAlignment)); + } + + // e.g., <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) + // return src mask + // + // Masked-off values are replaced with 0, which conveniently also represents + // initialized memory. + // + // AVX512 masked loads are lowered to Intrinsic::masked_load and are handled + // by handleMaskedLoad. + // + // We do not combine this with handleMaskedLoad; see comment in + // handleAVXMaskedStore for the rationale. + // + // This is subtly different than handleIntrinsicByApplyingToShadow(I, 1) + // because we need to apply getShadowOriginPtr, not getShadow, to the first + // parameter. + void handleAVXMaskedLoad(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + + Value *Src = I.getArgOperand(0); + assert(Src->getType()->isPointerTy() && "Source is not a pointer!"); + + Value *Mask = I.getArgOperand(1); + assert(isa<VectorType>(Mask->getType()) && "Mask is not a vector!"); + + const Align Alignment = Align(1); + + if (ClCheckAccessAddress) { + insertShadowCheck(Mask, &I); + } + + Type *SrcShadowTy = getShadowTy(Src); + Value *SrcShadowPtr, *SrcOriginPtr; + std::tie(SrcShadowPtr, SrcOriginPtr) = + getShadowOriginPtr(Src, IRB, SrcShadowTy, Alignment, /*isStore*/ false); + + SmallVector<Value *> ShadowArgs; + ShadowArgs.append(1, SrcShadowPtr); + ShadowArgs.append(1, Mask); + + CallInst *CI = + IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), ShadowArgs); + // The intrinsic may require floating-point but shadows can be arbitrary + // bit patterns, of which some would be interpreted as "invalid" + // floating-point values (NaN etc.); we assume the intrinsic will happily + // copy them. + setShadow(&I, IRB.CreateBitCast(CI, getShadowTy(&I))); + + if (!MS.TrackOrigins) + return; + + // The "pass-through" value is always zero (initialized). To the extent + // that that results in initialized aligned 4-byte chunks, the origin value + // is ignored. It is therefore correct to simply copy the origin from src. + Value *PtrSrcOrigin = IRB.CreateLoad(MS.OriginTy, SrcOriginPtr); + setOrigin(&I, PtrSrcOrigin); + } + + // Instrument BMI / BMI2 intrinsics.
// All of these intrinsics are Z = I(X, Y) // where the types of all operands and the result match, and are either i32 or @@ -4466,6 +4594,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { break; } + case Intrinsic::x86_avx_maskstore_ps: + case Intrinsic::x86_avx_maskstore_pd: + case Intrinsic::x86_avx_maskstore_ps_256: + case Intrinsic::x86_avx_maskstore_pd_256: + case Intrinsic::x86_avx2_maskstore_d: + case Intrinsic::x86_avx2_maskstore_q: + case Intrinsic::x86_avx2_maskstore_d_256: + case Intrinsic::x86_avx2_maskstore_q_256: { + handleAVXMaskedStore(I); + break; + } + + case Intrinsic::x86_avx_maskload_ps: + case Intrinsic::x86_avx_maskload_pd: + case Intrinsic::x86_avx_maskload_ps_256: + case Intrinsic::x86_avx_maskload_pd_256: + case Intrinsic::x86_avx2_maskload_d: + case Intrinsic::x86_avx2_maskload_q: + case Intrinsic::x86_avx2_maskload_d_256: + case Intrinsic::x86_avx2_maskload_q_256: { + handleAVXMaskedLoad(I); + break; + } + case Intrinsic::fshl: case Intrinsic::fshr: handleFunnelShift(I); diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll index 7273e431a9c2a..43f51a810d0d2 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll @@ -532,20 +532,22 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readonly define <2 x double> @test_x86_avx_maskload_pd(ptr %a0, <2 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_pd( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[TMP4]], <4 x i64> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x double> [[TMP5]] to <4 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]]) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0]], <4 x i64> [[MASK]]) +; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1] @@ -580,20 +584,22 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind read define <4 x float> @test_x86_avx_maskload_ps(ptr %a0, <4 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[TMP4]], <4 x i32> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0]], <4 x i32> [[MASK]]) +; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1] @@ -604,20 +610,22 @@ declare <4 
x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readonly define <8 x float> @test_x86_avx_maskload_ps_256(ptr %a0, <8 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[TMP4]], <8 x i32> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x float> [[TMP5]] to <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0]], <8 x i32> [[MASK]]) +; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1] @@ -628,23 +636,25 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind reado define void @test_x86_avx_maskstore_pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd( +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[TMP6]], <2 x i64> [[MASK:%.*]], <2 x double> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], 
label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]], <2 x double> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0]], <2 x i64> [[MASK]], <2 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) @@ -655,23 +665,25 @@ declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, <2 x double>) nounwind define void @test_x86_avx_maskstore_pd_256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[TMP6]], <4 x i64> [[MASK:%.*]], <4 x double> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]], <4 x double> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0]], <4 x i64> [[MASK]], <4 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd.256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) @@ -682,23 +694,25 @@ declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwi define void @test_x86_avx_maskstore_ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to 
ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[TMP6]], <4 x i32> [[MASK:%.*]], <4 x float> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]], <4 x float> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0]], <4 x i32> [[MASK]], <4 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) @@ -709,23 +723,25 @@ declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind define void @test_x86_avx_maskstore_ps_256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[TMP6]], <8 x i32> [[MASK:%.*]], <8 x float> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 
[[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]], <8 x float> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0]], <8 x i32> [[MASK]], <8 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps.256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll index e10062142c046..c68461dd367ee 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll @@ -995,20 +995,21 @@ declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind reado define <2 x i64> @test_x86_avx2_maskload_q(ptr %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[TMP4]], <2 x i64> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 8: +; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[RES]] ; %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] @@ -1019,20 +1020,21 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly define <4 x i64> @test_x86_avx2_maskload_q_256(ptr %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[TMP4]], <4 x i64> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> 
[[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]]) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 8: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0]], <4 x i64> [[A1]]) +; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i64> [[RES]] ; %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] @@ -1043,20 +1045,21 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonl define <4 x i32> @test_x86_avx2_maskload_d(ptr %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[TMP4]], <4 x i32> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 8: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %res = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] @@ -1067,20 +1070,21 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly define <8 x i32> @test_x86_avx2_maskload_d_256(ptr %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> 
@llvm.x86.avx2.maskload.d.256(ptr [[TMP4]], <8 x i32> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 8: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0]], <8 x i32> [[A1]]) +; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; %res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] @@ -1091,23 +1095,24 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonl define void @test_x86_avx2_maskstore_q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q( +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[TMP6]], <2 x i64> [[A1:%.*]], <2 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]]) +; CHECK: 9: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0]], <2 x i64> [[A1]], <2 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) @@ -1118,23 +1123,24 @@ declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind define void @test_x86_avx2_maskstore_q_256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q_256( +; CHECK-NEXT: 
[[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[TMP6]], <4 x i64> [[A1:%.*]], <4 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]]) +; CHECK: 9: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0]], <4 x i64> [[A1]], <4 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) @@ -1145,23 +1151,24 @@ declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind define void @test_x86_avx2_maskstore_d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[TMP6]], <4 x i32> [[A1:%.*]], <4 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br 
i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) +; CHECK: 9: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0]], <4 x i32> [[A1]], <4 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) @@ -1172,23 +1179,24 @@ declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind define void @test_x86_avx2_maskstore_d_256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[TMP6]], <8 x i32> [[A1:%.*]], <8 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32> [[A2:%.*]]) +; CHECK: 9: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0]], <8 x i32> [[A1]], <8 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll index 68337d6d962db..a22ca6dd15da4 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll @@ -550,21 +550,23 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readonly define <2 x double> @test_x86_avx_maskload_pd(ptr %a0, <2 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_pd( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[TMP11]], <2 x i64> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <2 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 10: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[A0]], <2 x i64> [[MASK]]) +; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.x86.avx.maskload.pd(ptr %a0, <2 x i64> %mask) ; <<2 x double>> [#uses=1] @@ -575,21 +577,23 @@ declare <2 x double> @llvm.x86.avx.maskload.pd(ptr, <2 x i64>) nounwind readonly define <4 x double> @test_x86_avx_maskload_pd_256(ptr %a0, <4 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_pd_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[TMP11]], <4 x i64> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x double> [[TMP6]] to <4 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]]) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 10: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0]], <4 x i64> [[MASK]]) +; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 
8 ; CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1] @@ -600,21 +604,23 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind read define <4 x float> @test_x86_avx_maskload_ps(ptr %a0, <4 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[TMP11]], <4 x i32> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 10: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0]], <4 x i32> [[MASK]]) +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1] @@ -625,21 +631,23 @@ declare <4 x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readonly define <8 x float> @test_x86_avx_maskload_ps_256(ptr %a0, <8 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[TMP11]], <8 x i32> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x float> [[TMP6]] to <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void 
@__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 10: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0]], <8 x i32> [[MASK]]) +; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1] @@ -650,24 +658,26 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind reado define void @test_x86_avx_maskstore_pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd( +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[TMP7]], <2 x i64> [[MASK:%.*]], <2 x double> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]], <2 x double> [[A2:%.*]]) +; CHECK: 11: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0]], <2 x i64> [[MASK]], <2 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) @@ -678,24 +688,26 @@ declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, <2 x double>) nounwind define void @test_x86_avx_maskstore_pd_256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[TMP7]], <4 x i64> [[MASK:%.*]], <4 x double> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]], <4 x double> [[A2:%.*]]) +; CHECK: 11: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0]], <4 x i64> [[MASK]], <4 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd.256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) @@ -706,24 +718,26 @@ declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwi define void @test_x86_avx_maskstore_ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[TMP7]], <4 x i32> [[MASK:%.*]], <4 x float> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: 
[[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]], <4 x float> [[A2:%.*]]) +; CHECK: 11: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0]], <4 x i32> [[MASK]], <4 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) @@ -734,24 +748,26 @@ declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind define void @test_x86_avx_maskstore_ps_256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[TMP7]], <8 x i32> [[MASK:%.*]], <8 x float> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]], <8 x float> [[A2:%.*]]) +; CHECK: 11: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0]], <8 x i32> [[MASK]], <8 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps.256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll index 29e2931d2ca48..442f0c422645a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll @@ -1048,21 +1048,22 @@ declare <8 x 
float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind reado define <2 x i64> @test_x86_avx2_maskload_q(ptr %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[TMP10]], <2 x i64> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[RES]] ; %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] @@ -1073,21 +1074,22 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly define <4 x i64> @test_x86_avx2_maskload_q_256(ptr %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[TMP10]], <4 x i64> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]]) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0]], <4 x i64> 
[[A1]]) +; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i64> [[RES]] ; %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] @@ -1098,21 +1100,22 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonl define <4 x i32> @test_x86_avx2_maskload_d(ptr %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[TMP10]], <4 x i32> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %res = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] @@ -1123,21 +1126,22 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly define <8 x i32> @test_x86_avx2_maskload_d_256(ptr %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[TMP10]], <8 x i32> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> 
@llvm.x86.avx2.maskload.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0]], <8 x i32> [[A1]]) +; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; %res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] @@ -1148,24 +1152,25 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonl define void @test_x86_avx2_maskstore_q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q( +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[TMP7]], <2 x i64> [[A1:%.*]], <2 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0]], <2 x i64> [[A1]], <2 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) @@ -1176,24 +1181,25 @@ declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind define void @test_x86_avx2_maskstore_q_256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr 
@__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[TMP7]], <4 x i64> [[A1:%.*]], <4 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0]], <4 x i64> [[A1]], <4 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) @@ -1204,24 +1210,25 @@ declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind define void @test_x86_avx2_maskstore_d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[TMP7]], <4 x i32> [[A1:%.*]], <4 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; 
CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0]], <4 x i32> [[A1]], <4 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) @@ -1232,24 +1239,25 @@ declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind define void @test_x86_avx2_maskstore_d_256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[TMP7]], <8 x i32> [[A1:%.*]], <8 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0]], <8 x i32> [[A1]], <8 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) From 980e86f130eea02bd41b887f4ed896340fc90f6c Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Sun, 26 Jan 2025 16:04:30 -0800 Subject: [PATCH 152/432] [msan] Add avx512-intrinsics.ll and avx512-intrinsics-upgrade.ll test case (#123980) These are forked from the corresponding files in llvm/test/CodeGen/X86/. avx512-intrinsics.ll shows that many intrinsics are already heuristically handled by MSan, and can be used to track refinements to the intrinsic handling. avx512-intrinsics-upgrade.ll tests intrinsics that LLVM "auto-upgrades"; for example, @llvm.x86.avx512.mask.store is converted into @llvm.masked.store (which has the interesting side effect that MemorySanitizer can already handle it via its existing handleMaskedStore). 
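As a rough sketch of that upgrade path (illustrative only, not part of this patch: the value names here are invented, and the 64-byte alignment is an assumption based on the natural alignment of a 512-bit store), a legacy masked store such as

    ; legacy AVX-512 intrinsic as written in older IR (hypothetical example)
    call void @llvm.x86.avx512.mask.store.d.512(ptr %p, <16 x i32> %v, i16 %m)

is rewritten by the IR auto-upgrader into the generic form

    ; after auto-upgrade: the i16 mask becomes <16 x i1>; alignment assumed to be 64
    %mask = bitcast i16 %m to <16 x i1>
    call void @llvm.masked.store.v16i32.p0(<16 x i32> %v, ptr %p, i32 64, <16 x i1> %mask)

which MemorySanitizer then instruments through its existing @llvm.masked.store handling, with no AVX-512-specific code needed.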
---
 .../X86/avx512-intrinsics-upgrade.ll | 19969 ++++++++++++++++
 .../MemorySanitizer/X86/avx512-intrinsics.ll | 13714 +++++++++++
 2 files changed, 33683 insertions(+)
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll

diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
new file mode 100644
index 0000000000000..edb618fdfb8fb
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
@@ -0,0 +1,19969 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt %s -S -mtriple=x86_64-linux-gnu -mattr=+avx512f -passes=msan 2>&1 | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+
+declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
+
+define i16 @unpckbw_test(i16 %a0, i16 %a1) #0 {
+;
+; CHECK-LABEL: @unpckbw_test(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i1> [[TMP3]], <16 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i1> [[TMP4]], <16 x i1> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i1> [[TMP5]], <16 x i1> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i1> [[TMP6]], <16 x i1> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[_MSPROP1]], <8 x i1> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i1> [[TMP8]], <8 x i1> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP2]] to i16
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
+; CHECK-NEXT: store i16 [[TMP10]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i16 [[TMP11]]
+;
+ %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
+ ret i16 %res
+}
+
+define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastd_gpr_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP6:%.*]] = 
insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <16 x i32> poison, i32 [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = shufflevector <16 x i32> [[_MSPROP6]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT1]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP7]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DOTSPLAT2]], [[X1:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP7]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[DOTSPLAT2]], <16 x i32> [[X1]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <16 x i32> poison, i32 [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = shufflevector <16 x i32> [[_MSPROP8]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT3]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP9]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[DOTSPLAT4]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP9]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT10:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[DOTSPLAT4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP5]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[DOTSPLAT]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP18]], <16 x i32> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP10]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP19]], <16 x i32> [[_MSPROP_SELECT10]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP17]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %res0 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 
0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} +declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16) + + +define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastq_gpr_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = shufflevector <8 x i64> [[_MSPROP6]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP7]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[DOTSPLAT2]], [[X1:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP7]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[DOTSPLAT2]], <8 x i64> [[X1]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = shufflevector <8 x i64> [[_MSPROP8]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP9]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[DOTSPLAT4]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP9]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT10:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[DOTSPLAT4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <8 x i64>, <8 
x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP5]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[DOTSPLAT]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP18]], <8 x i64> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP10]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP19]], <8 x i64> [[_MSPROP_SELECT10]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP17]], 2 +; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] +; + %res0 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask) + %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 + %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 + %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 + ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 +} +declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8) + + +declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly + +define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_x86_vbroadcast_ss_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> undef, i16 -1) + ret <16 x float> %res +} + +define <16 x float> @test_x86_mask_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) #0 { +; +; CHECK-LABEL: @test_x86_mask_vbroadcast_ss_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> +; 
CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[A1:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[A1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP13]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @test_x86_maskz_vbroadcast_ss_ps_512(<4 x float> %a0, i16 %mask ) #0 { +; +; CHECK-LABEL: @test_x86_maskz_vbroadcast_ss_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask) + ret <16 x float> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly + +define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_vbroadcast_sd_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> undef, i8 -1) + ret <8 x double> %res +} + +define <8 x double> @test_x86_mask_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) #0 { +; +; CHECK-LABEL: @test_x86_mask_vbroadcast_sd_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: 
[[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[A1:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[A1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask) + ret <8 x double> %res +} + +define <8 x double> @test_x86_maskz_vbroadcast_sd_pd_512(<2 x double> %a0, i8 %mask ) #0 { +; +; CHECK-LABEL: @test_x86_maskz_vbroadcast_sd_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pbroadcastd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call 
void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X1:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pbroadcastd_512(<4 x i32> %x0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_pbroadcastd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP3]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, 
align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pbroadcastq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X1:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pbroadcastq_512(<2 x i64> %x0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_pbroadcastq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 
x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP3]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP10]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_movsldup_512(<16 x float> %x0, <16 x float> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_movsldup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_movsldup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP13]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_maskz_movsldup_512(<16 x float> %x0, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_movsldup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP11]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_movshdup_512(<16 x float> %x0, <16 x float> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_movshdup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_movshdup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP13]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_maskz_movshdup_512(<16 x float> %x0, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_movshdup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP11]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
+ ret <16 x float> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_movddup_512(<8 x double> %x0, <8 x double> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_movddup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_movddup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X1:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP13]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_maskz_movddup_512(<8 x double> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_movddup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP11]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
+ ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_perm_df_512(<8 x double> %x0, <8 x double> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_perm_df_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_perm_df_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP13]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_maskz_perm_df_512(<8 x double> %x0, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_perm_df_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP11]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3)
+ ret <8 x double> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_perm_di_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_perm_di_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_perm_di_512(<8 x i64> %x0, i32 %x1, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_perm_di_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP3]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls,
align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP10]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) + ret <8 x i64> %res +} + +define void @test_store1(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_store1( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 1, <16 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr [[PTR]], i32 1, <16 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 1 +; CHECK-NEXT: store <16 x float> [[DATA]], ptr [[PTR2]], align 1 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.storeu.ps.512(ptr %ptr, <16 x float> %data, i16 %mask) + call void @llvm.x86.avx512.mask.storeu.ps.512(ptr %ptr2, <16 x float> %data, i16 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.ps.512(ptr, <16 x float>, i16 ) + +define void @test_store2(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_store2( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; 
CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 1, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr [[PTR]], i32 1, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 1 +; CHECK-NEXT: store <8 x double> [[DATA]], ptr [[PTR2]], align 1 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.storeu.pd.512(ptr %ptr, <8 x double> %data, i8 %mask) + call void @llvm.x86.avx512.mask.storeu.pd.512(ptr %ptr2, <8 x double> %data, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.pd.512(ptr, <8 x double>, i8) + +define void @test_mask_store_aligned_ps(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_store_aligned_ps( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 64, <16 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr [[PTR]], i32 64, <16 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 64 +; CHECK-NEXT: store <16 x float> [[DATA]], ptr [[PTR2]], align 64 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.store.ps.512(ptr %ptr, <16 x float> %data, i16 %mask) + call void @llvm.x86.avx512.mask.store.ps.512(ptr %ptr2, <16 x float> %data, i16 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.store.ps.512(ptr, <16 x float>, i16 ) + +define void @test_mask_store_aligned_pd(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_store_aligned_pd( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 64, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr [[PTR]], i32 64, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 64 +; CHECK-NEXT: store <8 x double> [[DATA]], ptr [[PTR2]], align 64 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.store.pd.512(ptr %ptr, <8 x double> %data, i8 %mask) + call void @llvm.x86.avx512.mask.store.pd.512(ptr %ptr2, <8 x double> %data, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.store.pd.512(ptr, <8 x double>, i8) + +define void@test_int_x86_avx512_mask_storeu_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_storeu_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load 
<8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 1, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr [[PTR1]], i32 1, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 1 +; CHECK-NEXT: store <8 x i64> [[X1]], ptr [[PTR2]], align 1 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.storeu.q.512(ptr %ptr1, <8 x i64> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.storeu.q.512(ptr %ptr2, <8 x i64> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.q.512(ptr, <8 x i64>, i8) + +define void@test_int_x86_avx512_mask_storeu_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_storeu_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 1, <16 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; 
CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr [[PTR1]], i32 1, <16 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 1 +; CHECK-NEXT: store <16 x i32> [[X1]], ptr [[PTR2]], align 1 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.storeu.d.512(ptr %ptr1, <16 x i32> %x1, i16 %x2) + call void @llvm.x86.avx512.mask.storeu.d.512(ptr %ptr2, <16 x i32> %x1, i16 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.d.512(ptr, <16 x i32>, i16) + +define void@test_int_x86_avx512_mask_store_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_store_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 64, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr [[PTR1]], i32 64, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 64 +; CHECK-NEXT: store <8 x i64> [[X1]], ptr [[PTR2]], align 64 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.store.q.512(ptr %ptr1, <8 x i64> %x1, i8 %x2) + call void 
@llvm.x86.avx512.mask.store.q.512(ptr %ptr2, <8 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.q.512(ptr, <8 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_store_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_store_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 64, <16 x i1> [[TMP6]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr [[PTR1]], i32 64, <16 x i1> [[TMP6]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
+; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
+; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 64
+; CHECK-NEXT: store <16 x i32> [[X1]], ptr [[PTR2]], align 64
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.store.d.512(ptr %ptr1, <16 x i32> %x1, i16 %x2)
+ call void @llvm.x86.avx512.mask.store.d.512(ptr %ptr2, <16 x i32> %x1, i16 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.d.512(ptr, <16 x i32>, i16)
+
+define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, ptr %ptr, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_aligned_ps(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x float>, ptr [[PTR:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK: 15:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 16:
+; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP10]], <16 x float> [[TMP5]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
+; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer)
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
+; CHECK: 24:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 25:
+; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP19]], <16 x float> zeroinitializer)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP26]], [[TMP17]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES4]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr %ptr, <16 x float> %res, i16 %mask)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 %mask)
+ %res4 = fadd <16 x float> %res2, %res1
+ ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr, <16 x float>, i16)
+
+define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, ptr %ptr, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_unaligned_ps(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x float>, ptr [[PTR:%.*]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 1, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK: 15:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 16:
+; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP10]], <16 x float> [[TMP5]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
+; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 1, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer)
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
+; CHECK: 24:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 25:
+; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP19]], <16 x float> zeroinitializer)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP26]], [[TMP17]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES4]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr %ptr, <16 x float> %res, i16 %mask)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 %mask)
+ %res4 = fadd <16 x float> %res2, %res1
+ ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr, <16 x float>, i16)
+
+define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, ptr %ptr, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_aligned_pd(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x double>, ptr [[PTR:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK: 15:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 16:
+; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP10]], <8 x double> [[TMP5]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
+; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer)
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
+; CHECK: 24:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 25:
+; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP19]], <8 x double> zeroinitializer)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[TMP26]], [[TMP17]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES4]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr %ptr, <8 x double> %res, i8 %mask)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x double> %res2, %res1
+ ret <8 x double> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr, <8 x double>, i8)
+
+define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, ptr %ptr, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_unaligned_pd(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x double>, ptr [[PTR:%.*]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 1, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK: 15:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 16:
+; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP10]], <8 x double> [[TMP5]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
+; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 1, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer)
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
+; CHECK: 24:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 25:
+; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP19]], <8 x double> zeroinitializer)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[TMP26]], [[TMP17]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES4]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr %ptr, <8 x double> %res, i8 %mask)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x double> %res2, %res1
+ ret <8 x double> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr, <8 x double>, i8)
+
+declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_load_unaligned_d(ptr %ptr, ptr %ptr2, <16 x i32> %data, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_unaligned_d(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr [[PTR:%.*]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP14]], i32 1, <16 x i1> [[TMP11]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[TMP10]] to i16
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
+; CHECK: 16:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 17:
+; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR2]], i32 1, <16 x i1> [[TMP11]], <16 x i32> [[TMP6]])
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
+; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
+; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP23]], i32 1, <16 x i1> [[TMP20]], <16 x i32> zeroinitializer)
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP19]] to i16
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP24]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP25:%.*]], label [[TMP26:%.*]], !prof [[PROF1]]
+; CHECK: 25:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 26:
+; CHECK-NEXT: [[TMP27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP20]], <16 x i32> zeroinitializer)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i32> [[TMP27]], [[TMP18]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES4]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr %ptr2, <16 x i32> %res, i16 %mask)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 %mask)
+ %res4 = add <16 x i32> %res2, %res1
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_load_unaligned_q(ptr %ptr, ptr %ptr2, <8 x i64> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_unaligned_q(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP14]], i32 1, <8 x i1> [[TMP11]], <8 x i64> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i1> [[TMP10]] to i8
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
+; CHECK: 16:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 17:
+; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR2]], i32 1, <8 x i1> [[TMP11]], <8 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
+; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP23]], i32 1, <8 x i1> [[TMP20]], <8 x i64> zeroinitializer)
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP19]] to i8
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP24]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP25:%.*]], label [[TMP26:%.*]], !prof [[PROF1]]
+; CHECK: 25:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 26:
+; CHECK-NEXT: [[TMP27:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP20]], <8 x i64> zeroinitializer)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i64> [[TMP27]], [[TMP18]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES4]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr %ptr2, <8 x i64> %res, i8 %mask)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i64> %res2, %res1
+ ret <8 x i64> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, ptr %ptr, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_aligned_d(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr [[PTR:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK: 15:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 16:
+; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[TMP5]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080 +; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] +; CHECK: 24: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 25: +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i32> [[TMP26]], [[TMP17]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 -1) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr %ptr, <16 x i32> %res, i16 %mask) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 %mask) + %res4 = add <16 x i32> %res2, %res1 + ret <16 x i32> %res4 +} + +declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr, <8 x i64>, i8) + +define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, ptr %ptr, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_load_aligned_q( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] +; CHECK: 15: +; CHECK-NEXT: 
call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 16: +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[TMP5]]) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080 +; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] +; CHECK: 24: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 25: +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]] +; CHECK-NEXT: [[RES4:%.*]] = add <8 x i64> [[TMP26]], [[TMP17]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES4]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 -1) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr %ptr, <8 x i64> %res, i8 %mask) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 %mask) + %res4 = add <8 x i64> %res2, %res1 + ret <8 x i64> %res4 +} + +declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermil_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermil_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; 
CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_vpermil_pd_512(<8 x double> %x0, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermil_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermil_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) #0 { +; +; 
CHECK-LABEL: @test_int_x86_avx512_mask_vpermil_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP13]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_vpermil_ps_512(<16 x float> %x0, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermil_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3) + ret <16 x float> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: 
@test_int_x86_avx512_pshuf_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X2:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pshuf_d_512(<16 x i32> %x0, i32 %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_pshuf_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP3]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x 
i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3) + ret <16 x i32> %res +} + +define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_pcmpeq_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[TMP4]], splat (i32 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[A]], [[B]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 +; CHECK-NEXT: store i16 [[TMP10]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP11]] +; + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_pcmpeq_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP5]], splat (i32 -1) +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i32> [[A]], [[B]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i1> [[TMP17]] to i16 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 +; CHECK-NEXT: store i16 [[TMP19]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP20]] +; + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16) + +define i8 
@test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) #0 { +; CHECK-LABEL: @test_pcmpeq_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[TMP4]], splat (i64 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[A]], [[B]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP9]] to i8 +; CHECK-NEXT: store i8 [[TMP10]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[TMP11]] +; + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_pcmpeq_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP5]], splat (i64 -1) +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[A]], [[B]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i1> [[TMP17]] to i8 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP18]] to i8 +; CHECK-NEXT: store i8 [[TMP19]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[TMP20]] +; + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8) + +define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_pcmpgt_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt <16 x i32> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt <16 x i32> [[A]], [[B]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[TMP13]] to i16 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i1> [[TMP14]] to i16 +; CHECK-NEXT: store i16 [[TMP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP16]] +; + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_pcmpgt_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP4]], [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP8]], [[TMP2]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt <16 x i32> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i1> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <16 x i32> [[A]], [[B]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i1> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP22]] to i16 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 +; CHECK-NEXT: store i16 [[TMP24]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP25]] +; + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16) + +define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) #0 { +; 
CHECK-LABEL: @test_pcmpgt_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt <8 x i64> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <8 x i64> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i1> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt <8 x i64> [[A]], [[B]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i1> [[TMP13]] to i8 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i1> [[TMP14]] to i8 +; CHECK-NEXT: store i8 [[TMP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[TMP16]] +; + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_pcmpgt_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP4]], [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], [[TMP2]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <8 x i64> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt <8 x i64> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i1> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <8 x i64> [[A]], [[B]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i1> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i1> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i1> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i1> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP22]] to i8 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i1> [[TMP23]] to i8 +; CHECK-NEXT: store i8 [[TMP24]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[TMP25]] +; + %res = 
call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8) + +declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_unpckh_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP3]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_unpckh_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP14]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_unpckh_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, 
+declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_unpckh_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_unpckh_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP14]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
+ ret <16 x float> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_unpckl_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP3]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_unpckl_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP14]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
+ ret <8 x double> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_unpckl_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_unpckl_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP14]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
+ ret <16 x float> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_punpcklqd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
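+; The integer unpacks below follow the same scheme as the pd/ps tests; the
+; one visible difference in the instrumentation is that the xor/or shadow
+; computation for the masked select operates on the <8 x i64> (or <16 x i32>)
+; values directly, with no bitcasts from a floating-point type.
+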
+define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_punpcklqd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_punpcklqd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_punpckhqd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_punpckhqd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_punpckhd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_punpckhd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_punpckld_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_punpckld_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32>
@llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_pslli_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_pslli_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[A1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[A1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP13]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_pslli_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], 
zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_pslli_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_pslli_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[A1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[A1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP13]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_pslli_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; 
CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrli_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrli_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[A1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[A1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr 
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP13]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrli_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrli_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrli_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = 
bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[A1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[A1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP13]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrli_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrai_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrai_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), 
align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[A1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[A1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP13]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrai_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrai_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 
7) +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrai_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[A1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[A1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP13]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrai_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, 
i32, <8 x i64>, i8) nounwind readnone + +declare void @llvm.x86.avx512.storent.q.512(ptr, <8 x i64>) + +define void@test_storent_q_512(<8 x i64> %data, ptr %ptr) #0 { +; +; CHECK-LABEL: @test_storent_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP7]], align 64 +; CHECK-NEXT: store <8 x i64> [[DATA:%.*]], ptr [[PTR]], align 64, !nontemporal [[META2:![0-9]+]] +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.storent.q.512(ptr %ptr, <8 x i64> %data) + ret void +} + +declare void @llvm.x86.avx512.storent.pd.512(ptr, <8 x double>) + +define void @test_storent_pd_512(<8 x double> %data, ptr %ptr) #0 { +; +; CHECK-LABEL: @test_storent_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP7]], align 64 +; CHECK-NEXT: store <8 x double> [[DATA:%.*]], ptr [[PTR]], align 64, !nontemporal [[META2]] +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.storent.pd.512(ptr %ptr, <8 x double> %data) + ret void +} + +declare void @llvm.x86.avx512.storent.ps.512(ptr, <16 x float>) + +define void @test_storent_ps_512(<16 x float> %data, ptr %ptr) #0 { +; +; CHECK-LABEL: @test_storent_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP7]], align 64 +; CHECK-NEXT: store <16 x float> [[DATA:%.*]], ptr [[PTR]], align 64, !nontemporal [[META2]] +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.storent.ps.512(ptr %ptr, <16 x float> %data) + ret void +} + +define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) #0 { 
+; CHECK-LABEL: @test_xor_epi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_xor_epi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) + ret < 16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_or_epi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -1) +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -1) +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[A]], [[B]] +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %res = call <16 x i32> 
@llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_or_epi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -1) +; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[TMP1]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[A]], [[B]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP19]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) + ret < 16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_and_epi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[A:%.*]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[A]], [[B]] +; CHECK-NEXT: store <16 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP8]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_and_epi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x 
i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[A:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP1]], [[B:%.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP9]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP9]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) #0 {
+; CHECK-LABEL: @test_xor_epi64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_xor_epi64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) #0 {
+; CHECK-LABEL: @test_or_epi64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -1)
+; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -1)
+; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[A]], [[B]]
+; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP10]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_or_epi64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -1)
+; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -1)
+; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP1]], [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[A]], [[B]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]]
+; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP19]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) #0 {
+; CHECK-LABEL: @test_and_epi64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[A:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[B:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[A]], [[B]]
+; CHECK-NEXT: store <8 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP8]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_and_epi64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[A:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP1]], [[B:%.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[A]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP9]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP9]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP17]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
+; CHECK-LABEL: @test_mask_add_epi32_rr(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rrk(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rrkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rm(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP8]]
+;
+ %b = load <16 x i32>, ptr %ptr_b
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rmk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %b = load <16 x i32>, ptr %ptr_b
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rmkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP16]]
+;
+ %b = load <16 x i32>, ptr %ptr_b
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rmb(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP8]]
+;
+ %q = load i32, ptr %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rmbk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP2]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %q = load i32, ptr %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rmbkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP16]]
+;
+ %q = load i32, ptr %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
+; CHECK-LABEL: @test_mask_sub_epi32_rr(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rrk(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rrkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rm(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP8]]
+;
+ %b = load <16 x i32>, ptr %ptr_b
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rmk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %b = load <16 x i32>, ptr %ptr_b
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rmkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP16]]
+;
+ %b = load <16 x i32>, ptr %ptr_b
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rmb(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP8]]
+;
+ %q = load i32, ptr %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rmbk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP2]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %q = load i32, ptr %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rmbkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP16]]
+;
+ %q = load i32, ptr %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) #0 {
+; CHECK-LABEL: @test_mask_add_epi64_rr(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rrk(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rrkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rm(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP8]]
+;
+ %b = load <8 x i64>, ptr %ptr_b
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rmk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP9]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP17]]
+;
+ %b = load <8 x i64>, ptr %ptr_b
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rmkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP16]]
+;
+ %b = load <8 x i64>, ptr %ptr_b
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rmb(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP8]]
+;
+ %q = load i64, ptr %ptr_b
+ %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rmbk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP17]]
+;
+ %q = load i64, ptr %ptr_b
+ %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rmbkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP16]]
+;
+ %q = load i64, ptr %ptr_b
+ %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) #0 {
+; CHECK-LABEL: @test_mask_sub_epi64_rr(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi64_rrk(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi64_rrkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi64_rm(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP8]]
+;
+ %b = load <8 x i64>, ptr %ptr_b
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi64_rmk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP9]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]]
to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; + %b = load <8 x i64>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sub_epi64_rmkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %b = load <8 x i64>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mask_sub_epi64_rmb( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label 
[[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP8]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sub_epi64_rmbk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]] +; 
CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sub_epi64_rmbkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +declare <8 x i64> 
@llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_mask_mullo_epi32_rr_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rrk_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rrkz_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; 
CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rm_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]] +; CHECK-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP8]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rmk_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP11:%.*]] = 
bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rmkz_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]] +; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rmb_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP8]] +; + %q = load i32, ptr %ptr_b + %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 + %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rmbk_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: 
[[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP2]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %q = load i32, ptr %ptr_b + %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 + %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rmbkz_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %q = load i32, ptr %ptr_b + %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 + %b = shufflevector <16 x i32> 
%vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+
+declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_shuf_f32x4(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_f32x4(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X3:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X3]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP14]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
+ ret <16 x float> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_shuf_f64x2(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP3]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_f64x2(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X3:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X3]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP14]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_maskz_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, i8 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_shuf_f64x2(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP11]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP12]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
+ ret <8 x double> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_shuf_i32x4(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_i32x4(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X3:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X3]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
+ ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_shuf_i64x2(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_i64x2(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X3:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X3]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
+ ret <8 x i64> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_shuf_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 3, i32 10, i32 5, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 0, i32 9, i32 3, i32 10, i32 5, i32 12, i32 6, i32 14>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP3]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 3, i32 10, i32 5, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 0, i32 9, i32 3, i32 10, i32 5, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X3:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X3]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP14]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_maskz_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, i8 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_shuf_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 3, i32 10, i32 5, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 0, i32 9, i32 3, i32 10, i32 5, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP11]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP12]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
+ ret <8 x double> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_shuf_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 2, i32 1, i32 17, i32 16, i32 6, i32 5, i32 21, i32 20, i32 10, i32 9, i32 25, i32 24, i32 14, i32 13, i32 29, i32 28>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 2, i32 1, i32 17, i32 16, i32 6, i32 5, i32 21, i32 20, i32 10, i32 9, i32 25, i32 24, i32 14, i32 13, i32 29, i32 28>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 2, i32 1, i32 17, i32 16, i32 6, i32 5, i32 21, i32 20, i32 10, i32 9, i32 25, i32 24, i32 14, i32 13, i32 29, i32 28>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 2, i32 1, i32 17, i32 16, i32 6, i32 5, i32 21, i32 20, i32 10, i32 9, i32 25, i32 24, i32 14, i32 13, i32 29, i32 28>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X3:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X3]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP14]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
+ ret <16 x float> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmaxs_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmaxs_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmaxu_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pmaxu_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = 
select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pmins_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pmins_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pminu_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: 
[[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pminu_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: 
[[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) #0 { +; +; CHECK-LABEL: @test_mm_mask_move_ss( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP0]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[__U:%.*]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP0]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[__U]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = or i8 [[TMP8]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP11]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[TMP11]], -1 +; CHECK-NEXT: [[TMP14:%.*]] = and i8 [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0 +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP9]], 0 +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[__B:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[__W:%.*]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP]], i32 [[_MSPROP1]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast float [[TMP17]] to i32 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP18]] to i32 +; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], [[_MSPROP]] +; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i32 [[TMP24]], i32 [[TMP19]] +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP16]], float [[TMP17]], float [[TMP18]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[__A:%.*]], float [[TMP25]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP26]] +; +entry: + %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__W, i8 %__U) + ret <4 x float> %res +} + + +define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) #0 { +; +; CHECK-LABEL: @test_mm_maskz_move_ss( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @__msan_param_tls, align 8 
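All of the min/max tests above check the same two MemorySanitizer shadow shapes. For the unmasked intrinsics (upgraded to @llvm.smax/@llvm.umax/@llvm.smin/@llvm.umin) the result shadow is simply the OR of the two operand shadows ([[_MSPROP]]). The masked variants additionally model the blend with the passthru operand as a shadow select, and the [[TMP9]]..[[TMP11]] xor/or chain builds the conservative fallback used for lanes whose mask bit is itself uninitialized: any bit that differs between the two arms, or is poisoned in either arm, stays poisoned. A minimal sketch of the initialized-mask path, with illustrative names rather than the TLS loads used by the real instrumentation:

define <16 x i32> @sketch_masked_smax_shadow(<16 x i32> %sa, <16 x i32> %sb, <16 x i32> %spass, i16 %mask) {
  ; Element-wise min/max: any poisoned operand bit poisons the result.
  %sop = or <16 x i32> %sa, %sb
  %m = bitcast i16 %mask to <16 x i1>
  ; Masked blend: mask-on lanes take the op shadow, mask-off lanes the
  ; passthru shadow. The _MSPROP_SELECT sequence above additionally
  ; handles a mask whose own shadow is nonzero; this sketch assumes a
  ; fully initialized mask.
  %sres = select <16 x i1> %m, <16 x i32> %sop, <16 x i32> %spass
  ret <16 x i32> %sres
}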
+define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) #0 {
+;
+; CHECK-LABEL: @test_mm_mask_move_ss(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP0]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[__U:%.*]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP0]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[__U]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[TMP9]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = or i8 [[TMP8]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP11]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[TMP11]], -1
+; CHECK-NEXT: [[TMP14:%.*]] = and i8 [[TMP13]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP9]], 0
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[__B:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[__W:%.*]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP]], i32 [[_MSPROP1]]
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast float [[TMP17]] to i32
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP18]] to i32
+; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i32 [[TMP24]], i32 [[TMP19]]
+; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP16]], float [[TMP17]], float [[TMP18]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[__A:%.*]], float [[TMP25]], i64 0
+; CHECK-NEXT: store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP26]]
+;
+entry:
+ %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__W, i8 %__U)
+ ret <4 x float> %res
+}
+
+
+define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) #0 {
+;
+; CHECK-LABEL: @test_mm_maskz_move_ss(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP0]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[__U:%.*]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[TMP0]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[__U]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = xor i8 [[TMP8]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = or i8 [[TMP7]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = xor i8 [[TMP10]], -1
+; CHECK-NEXT: [[TMP13:%.*]] = and i8 [[TMP12]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[TMP13]], 0
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP11]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[__B:%.*]], i64 0
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], i32 [[_MSPROP]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP16]] to i32
+; CHECK-NEXT: [[TMP19:%.*]] = xor i32 [[TMP18]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i32 [[TMP21]], i32 [[TMP17]]
+; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP15]], float [[TMP16]], float 0.000000e+00
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[__A:%.*]], float [[TMP22]], i64 0
+; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP23]]
+;
+entry:
+ %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> zeroinitializer, i8 %__U)
+ ret <4 x float> %res
+}
+
+define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) #0 {
+;
+; CHECK-LABEL: @test_mm_mask_move_sd(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP0]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[__U:%.*]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP0]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[__U]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[TMP9]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = or i8 [[TMP8]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP11]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[TMP11]], -1
+; CHECK-NEXT: [[TMP14:%.*]] = and i8 [[TMP13]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP9]], 0
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[__B:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[__W:%.*]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP16]], i64 [[_MSPROP]], i64 [[_MSPROP1]]
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast double [[TMP17]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast double [[TMP18]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP22]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i64 [[TMP24]], i64 [[TMP19]]
+; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP16]], double [[TMP17]], double [[TMP18]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[__A:%.*]], double [[TMP25]], i64 0
+; CHECK-NEXT: store <2 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP26]]
+;
+entry:
+ %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__W, i8 %__U)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) #0 {
+;
+; CHECK-LABEL: @test_mm_maskz_move_sd(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP0]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[__U:%.*]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[TMP0]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[__U]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = xor i8 [[TMP8]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = or i8 [[TMP7]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = xor i8 [[TMP10]], -1
+; CHECK-NEXT: [[TMP13:%.*]] = and i8 [[TMP12]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[TMP13]], 0
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP11]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[__B:%.*]], i64 0
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], i64 [[_MSPROP]], i64 0
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast double [[TMP16]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[TMP20]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i64 [[TMP21]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP15]], double [[TMP16]], double 0.000000e+00
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> [[__A:%.*]], double [[TMP22]], i64 0
+; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP23]]
+;
+entry:
+ %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> zeroinitializer, i8 %__U)
+ ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
+declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
+
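The mask/maskz move_ss and move_sd tests above exercise the same select modeling for a single lane: only bit 0 of %__U participates (the and i8 ..., 1), the [[_MSPROP_ICMP]] chain computes the shadow of the icmp on a possibly uninitialized mask byte, and the blended lane-0 shadow is re-inserted into the shadow of %__A, which supplies the upper lanes. A minimal sketch with illustrative names, again assuming the mask byte itself is initialized:

define <4 x i32> @sketch_mask_move_ss_shadow(<4 x i32> %sa, <4 x i32> %sb, <4 x i32> %sw, i8 %u) {
  %bit0 = and i8 %u, 1                        ; only mask bit 0 is used
  %take_b = icmp ne i8 %bit0, 0
  %sb0 = extractelement <4 x i32> %sb, i64 0  ; lane-0 shadow of %__B
  %sw0 = extractelement <4 x i32> %sw, i64 0  ; lane-0 shadow of %__W
  %slane = select i1 %take_b, i32 %sb0, i32 %sw0
  ; Lanes 1..3 of the result come from %__A, so start from its shadow.
  %sres = insertelement <4 x i32> %sa, i32 %slane, i64 0
  ret <4 x i32> %sres
}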
+declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovzxb_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i8> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i32>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxb_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i8> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_maskz_pmovzxb_d_512(<16 x i8> %x0, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxb_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i8> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
+ ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovzxb_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i8> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxb_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i8> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pmovzxb_q_512(<16 x i8> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxb_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i8> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovzxd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i32> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i32> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pmovzxd_q_512(<8 x i32> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i32> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ ret <8 x i64> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovzxw_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i16> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i16> [[TMP2]] to <16 x i32>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxw_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i16> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i16> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_maskz_pmovzxw_d_512(<16 x i16> %x0, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxw_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i16> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i16> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
+ ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovzxw_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i16> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxw_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i16> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pmovzxw_q_512(<8 x i16> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxw_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i16> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ ret <8 x i64> %res
+}
+
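From here the file moves from the zero-extending pmovzx tests to the sign-extending pmovsx group. In both, the shadow mirrors the value computation: the same shufflevector lane selection is applied to the shadow, with an all-ones (fully poisoned) vector in the unused second slot, and the widening uses zext for pmovzx (the new high bits of the value are constant zero, hence defined) and sext for pmovsx (the shadow of the sign bit must cover the widened bits). A minimal sketch for the <8 x i32> to <8 x i64> case; the identity lane mask is written out here for the sketch, and all names are illustrative:

define <8 x i64> @sketch_pmovzxd_q_shadow(<8 x i32> %sx) {
  ; Lane selection applied to the shadow; the splat(-1) second operand
  ; means any lane drawn from it would read as fully uninitialized.
  %lanes = shufflevector <8 x i32> %sx, <8 x i32> splat (i32 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; zext for pmovzx; the pmovsx tests below use sext instead.
  %sres = zext <8 x i32> %lanes to <8 x i64>
  ret <8 x i64> %sres
}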
+declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovsxb_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i8> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i32>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxb_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i8> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_maskz_pmovsxb_d_512(<16 x i8> %x0, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxb_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i8> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
+ ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovsxb_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i8> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxb_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i8> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pmovsxb_q_512(<16 x i8> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxb_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i8> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovsxd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i32> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i32> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pmovsxd_q_512(<8 x i32> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i32> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i32> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ ret <8 x i64> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovsxw_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i16> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i16> [[TMP2]] to <16 x i32>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxw_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i16> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i16> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_maskz_pmovsxw_d_512(<16 x i16> %x0, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxw_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i16> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i16> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]]
= select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pmovsxw_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i16> [[_MSPROP]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxw_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i16> [[_MSPROP]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovsxw_q_512(<8 x i16> %x0, i8 %x2) #0 { +; +; 
CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxw_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i16> [[_MSPROP]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx512_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prolv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1) + ret <16 x i32> %1 +} + +define <16 x i32>@test_int_x86_avx512_mask_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] 
to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 + ret <16 x i32> %3 +} + +define <16 x i32>@test_int_x86_avx512_maskz_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +declare <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64>, <8 x i64>) + +define <8 x i64>@test_int_x86_avx512_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prolv_q_512( +; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1) + ret <8 x i64> %1 +} + +define <8 x i64>@test_int_x86_avx512_mask_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 + ret <8 x i64> %3 +} + +define <8 x i64>@test_int_x86_avx512_maskz_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; 
CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +declare <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx512_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prorv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1) + ret <16 x i32> %1 +} + +define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> 
[[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 + ret <16 x i32> %3 +} + +define <16 x i32>@test_int_x86_avx512_maskz_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +declare <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64>, <8 x i64>) + +define <8 x i64>@test_int_x86_avx512_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prorv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1) + ret <8 x i64> %1 +} + +define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 + ret <8 x i64> %3 +} + +define <8 x i64>@test_int_x86_avx512_maskz_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> 
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +declare <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32>, i32) + +define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_prol_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3)) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4)) +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4)) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer +; 
CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5)) +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5)) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 3) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 + %4 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 4) + %5 = bitcast i16 %x3 to <16 x i1> + %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer + %7 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 5) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %6, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %7, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} + +declare <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64>, i32) + +define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_prol_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3)) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> 
[[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4)) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4)) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5)) +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5)) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 +; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 3) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 + %4 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 4) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer + %7 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 5) + %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %3, 0 + %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %6, 1 + %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %7, 2 + ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 +} + +declare <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32>, i32) + +define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_pror_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: 
@test_int_x86_avx512_pror_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3)) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4)) +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4)) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5)) +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5)) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 
x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %1 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 3) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 + %4 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 4) + %5 = bitcast i16 %x3 to <16 x i1> + %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer + %7 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 5) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %6, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %7, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} + +declare <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64>, i32) + +define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_pror_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_pror_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3)) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4)) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4)) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5)) +; 
CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5)) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 +; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] +; + %1 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 3) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 + %4 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 4) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer + %7 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 5) + %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %3, 0 + %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %6, 1 + %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %7, 2 + ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i32, <8 x i64>, i8) + +define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_qi_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[X0:%.*]], i32 4) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 5) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer 
+; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[X0]], i32 5) +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 6) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[X0]], i32 6) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP18]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP18]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP20]], <8 x i64> [[TMP25]], <8 x i64> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP19]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[TMP15]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP16]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 +; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] +; + %res0 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 6, <8 x i64> zeroinitializer, i8 %x3) + %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 + %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 + %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 + ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 +} + +declare <16 x i32>@llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16) + +define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[X0:%.*]], i32 4) +; CHECK-NEXT: 
[[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 5) +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[X0]], i32 5) +; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 6) +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[X0]], i32 6) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP18]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> [[TMP23]], [[TMP18]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP20]], <16 x i32> [[TMP25]], <16 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[TMP15]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP16]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %res0 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 6, <16 x i32> zeroinitializer, i16 %x3) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} + +declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i32, <16 x i32>, i16) + +define { 
<16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_psra_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[X0:%.*]], i32 3) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 4) +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[X0]], i32 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 5) +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[X0]], i32 5) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x 
i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %res0 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} + +declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i32, <8 x i64>, i8) + +define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_psra_qi_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[X0:%.*]], i32 3) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 4) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[X0]], i32 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 5) +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[X0]], i32 5) +; CHECK-NEXT: 
[[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 +; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] +; + %res0 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) + %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 + %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 + %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 + ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 +} + +declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i32, <16 x i32>, i16) + +define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_psll_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[X0:%.*]], i32 3) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 4) +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[X0]], i32 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: 
[[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 5) +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[X0]], i32 5) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %res0 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} + +declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i32, <8 x i64>, i8) + +define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_psll_qi_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[X0:%.*]], i32 3) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> 
[[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 4) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[X0]], i32 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 5) +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[X0]], i32 5) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 +; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] +; + %res0 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) + %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 + %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 + %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 + ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 +} + +define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psll_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psll_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[A2:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP19]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psll_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) 
to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP18]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psll_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP10]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psll_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), 
i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[A2:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psll_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP17]], <8 x i64> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP11]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP18]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> 
zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrl_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrl_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[A2:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP19]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> 
%a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrl_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP18]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrl_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP10]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 
%mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrl_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[A2:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrl_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], 
[[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP17]], <8 x i64> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP11]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP18]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psra_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psra_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[A2:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> 
[[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP19]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psra_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP18]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psra_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; 
CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP10]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psra_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[A2:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psra_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: 
[[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP17]], <8 x i64> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP11]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP18]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psllv_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psllv_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 
x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[A2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psllv_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psllv_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call 
<8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psllv_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[A2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psllv_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; 
CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone + + +define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrav_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrav_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[A2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x 
i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrav_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrav_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> 
%a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrav_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[A2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrav_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> 
zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrlv_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrlv_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[A2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> 
@test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrlv_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrlv_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[A2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone + 
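+; A minimal sketch of the shadow propagation that the psllv/psrav/psrlv checks
+; above all encode; the %s_* names are illustrative only and are not matched by
+; FileCheck. MSan cannot shift shadow bits by a possibly-uninitialized amount,
+; so it shifts the value shadow by the concrete amounts and then fully poisons
+; every lane whose shift amount is itself poisoned:
+;
+;   %s_amt  = icmp ne <8 x i64> %shadow_a1, zeroinitializer    ; amount lane poisoned?
+;   %s_ones = sext <8 x i1> %s_amt to <8 x i64>                ; widen to all-ones lanes
+;   %s_shl  = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %shadow_a0, <8 x i64> %a1)
+;   %s_res  = or <8 x i64> %s_shl, %s_ones                     ; final result shadow
+;
+; The mask/maskz variants then merge %s_res with the pass-through shadow via the
+; _MSPROP_SELECT select pattern seen in the checks.
+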
+define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, ptr %ptr) #0 { +; +; CHECK-LABEL: @test_x86_avx512_psrlv_q_memop( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 64 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = sext <8 x i1> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP2]], <8 x i64> [[B]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[B]]) +; CHECK-NEXT: store <8 x i64> [[TMP11]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %b = load <8 x i64>, ptr %ptr + %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_cvt_dq2pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[CVT:%.*]] = sitofp <8 x i32> [[X0:%.*]] to <8 x double> +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[CVT]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[CVT:%.*]] = sitofp <8 x i32> [[X0:%.*]] to <8 x double> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[CVT]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X1:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] 
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[CVT]], <8 x double> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_cvt_udq2pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[CVT:%.*]] = uitofp <8 x i32> [[X0:%.*]] to <8 x double> +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[CVT]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[CVT:%.*]] = uitofp <8 x i32> [[X0:%.*]] to <8 x double> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[CVT]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X1:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[CVT]], <8 x double> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) + ret <8 x double> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) #0 { +; CHECK-LABEL: @test_x86_vcvtph2ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) #0 { +; CHECK-LABEL: @test_x86_vcvtph2ps_512_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_vcvtph2ps_512_rrk( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> [[A1:%.*]], i16 [[MASK:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_vcvtph2ps_512_sae_rrkz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; 
CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_x86_vcvtph2ps_512_rrkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
+
+define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) #0 {
+; CHECK-LABEL: @test_valign_q(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[PALIGNR]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_valign_q(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[PALIGNR]], [[SRC:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[PALIGNR]], <8 x i64> [[SRC]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> %src, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+
+define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_maskz_valign_d(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> [[A:%.*]], <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[PALIGNR]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[PALIGNR]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP10]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i32 5, <16 x i32> zeroinitializer, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+
+declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP7]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP18]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_512(<8 x double> %x0, <8 x 
i64> %x1, <8 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermilvar_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> 
@llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) + ret <16 x float> %res +} + + +define <16 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermilvar_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], 
!prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) + ret <16 x float> %res +} + +; Test case to make sure we can print shuffle decode comments for constant pool loads. +define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> ) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[X2]] +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] +; CHECK: 18: +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 19: +; CHECK-NEXT: [[TMP20:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> ) +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP22]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x float> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = or <16 x i32> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP27]], <16 x i32> [[TMP23]] +; CHECK-NEXT: [[TMP28:%.*]] = select <16 x i1> [[TMP22]], <16 x float> [[TMP20]], <16 x float> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP29]], 0 +; CHECK-NEXT: br i1 [[_MSCMP4]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] +; CHECK: 30: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 31: +; CHECK-NEXT: [[TMP32:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], [[_MSPROP_SELECT1]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[TMP16]], [[TMP28]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> zeroinitializer, [[_MSPROP]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP32]], [[RES3]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES4]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3) + %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> zeroinitializer, i16 %x3) + %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 -1) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res2, %res3 + ret <16 x float> %res4 +} + +define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_mask_mul_epi32_rr( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i64> [[TMP3]], splat (i64 32) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32) +; CHECK-NEXT: [[TMP10:%.*]] = ashr <8 x i64> [[TMP8]], splat (i64 32) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], 
zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rrk( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) +; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = ashr <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[_MSPROP]] +; CHECK-NEXT: [[TMP27:%.*]] = or <8 x i64> [[TMP26]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP27]], <8 x i64> [[TMP24]] +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP28]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 
8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rrkz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) +; CHECK-NEXT: [[TMP11:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]] +; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP27]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 
[[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = ashr <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = ashr <8 x i64> [[TMP19]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP24]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat 
(i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP33]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; 
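; The shl/ashr pairs in these checks mirror the data-level lowering of
; pmuldq: AutoUpgrade rewrites the masked intrinsic into generic IR that
; multiplies the sign-extended low 32 bits of each 64-bit lane, and MSan
; then instruments each generic instruction, narrowing the operand shadows
; the same way (shl by 32, then ashr by 32). The interleaved
; "or ..., zeroinitializer" terms appear to be the shift instrumentation
; folding in the all-zero shadow of the constant shift amount. A reduced
; sketch of the propagation, with illustrative names not taken from the
; generated checks:
;   %lo_a   = shl  <8 x i64> %s_a, splat (i64 32)   ; keep low-half shadow
;   %sext_a = ashr <8 x i64> %lo_a, splat (i64 32)  ; sign-extend it
;   %lo_b   = shl  <8 x i64> %s_b, splat (i64 32)
;   %sext_b = ashr <8 x i64> %lo_b, splat (i64 32)
;   %s_mul  = or <8 x i64> %sext_a, %sext_b         ; poisoned if either is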
CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP32]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmb( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] 
= or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP25]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmbk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: 
[[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = shl <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = ashr <8 x i64> [[TMP23]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP30:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP32]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP33]], <8 x i64> [[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP34]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmbk_buildvector(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmbk_buildvector( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1 +; CHECK-NEXT: [[VECINIT_I1:%.*]] = insertelement <8 x i64> [[VECINIT_I]], i64 [[Q]], i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i64> [[_MSPROP1]], i64 [[_MSLD]], i32 2 
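; For a build vector spelled as an insertelement chain, the shadow is built
; by the parallel [[_MSPROP1]]..[[_MSPROP7]] chain checked here: it starts
; from splat (i64 -1) (every lane fully uninitialized) and inserts
; [[_MSLD]], the shadow loaded for %q, into the same lane as each data
; insert, so a result lane is poisoned exactly when the scalar written
; into it was.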
+; CHECK-NEXT: [[VECINIT_I2:%.*]] = insertelement <8 x i64> [[VECINIT_I1]], i64 [[Q]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i64> [[_MSPROP2]], i64 [[_MSLD]], i32 3 +; CHECK-NEXT: [[VECINIT_I3:%.*]] = insertelement <8 x i64> [[VECINIT_I2]], i64 [[Q]], i32 3 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i64> [[_MSPROP3]], i64 [[_MSLD]], i32 4 +; CHECK-NEXT: [[VECINIT_I4:%.*]] = insertelement <8 x i64> [[VECINIT_I3]], i64 [[Q]], i32 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i64> [[_MSPROP4]], i64 [[_MSLD]], i32 5 +; CHECK-NEXT: [[VECINIT_I5:%.*]] = insertelement <8 x i64> [[VECINIT_I4]], i64 [[Q]], i32 5 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> [[_MSPROP5]], i64 [[_MSLD]], i32 6 +; CHECK-NEXT: [[VECINIT_I6:%.*]] = insertelement <8 x i64> [[VECINIT_I5]], i64 [[Q]], i32 6 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i64> [[_MSPROP6]], i64 [[_MSLD]], i32 7 +; CHECK-NEXT: [[B64:%.*]] = insertelement <8 x i64> [[VECINIT_I6]], i64 [[Q]], i32 7 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP7]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = shl <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = ashr <8 x i64> [[TMP23]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP8:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP30:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[_MSPROP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[_MSPROP8]] +; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP32]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP33]], <8 x i64> [[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP34]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement < 8 x i64> undef, i64 %q, i32 0 + %vecinit.i1 = insertelement < 8 x i64> %vecinit.i, i64 %q, i32 1 + %vecinit.i2 = insertelement < 8 x i64> %vecinit.i1, i64 %q, i32 2 + %vecinit.i3 = insertelement < 8 x i64> %vecinit.i2, i64 %q, i32 3 + %vecinit.i4 = insertelement < 8 x i64> %vecinit.i3, 
i64 %q, i32 4 + %vecinit.i5 = insertelement < 8 x i64> %vecinit.i4, i64 %q, i32 5 + %vecinit.i6 = insertelement < 8 x i64> %vecinit.i5, i64 %q, i32 6 + %b64 = insertelement < 8 x i64> %vecinit.i6, i64 %q, i32 7 + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmbkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP2]], <8 x i64> 
zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP33]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmbkz_buildvector(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmbkz_buildvector( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1 +; CHECK-NEXT: [[VECINIT_I1:%.*]] = insertelement <8 x i64> [[VECINIT_I]], i64 [[Q]], i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i64> [[_MSPROP1]], i64 [[_MSLD]], i32 2 +; CHECK-NEXT: [[VECINIT_I2:%.*]] = insertelement <8 x i64> [[VECINIT_I1]], i64 [[Q]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i64> [[_MSPROP2]], i64 [[_MSLD]], i32 3 +; CHECK-NEXT: [[VECINIT_I3:%.*]] = insertelement <8 x i64> [[VECINIT_I2]], i64 [[Q]], i32 3 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i64> [[_MSPROP3]], i64 [[_MSLD]], i32 4 +; CHECK-NEXT: [[VECINIT_I4:%.*]] = insertelement <8 x i64> [[VECINIT_I3]], i64 [[Q]], i32 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i64> [[_MSPROP4]], i64 [[_MSLD]], i32 5 +; CHECK-NEXT: [[VECINIT_I5:%.*]] = insertelement <8 x i64> [[VECINIT_I4]], i64 [[Q]], i32 5 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> [[_MSPROP5]], i64 [[_MSLD]], i32 6 +; CHECK-NEXT: [[VECINIT_I6:%.*]] = insertelement <8 x i64> [[VECINIT_I5]], i64 [[Q]], i32 6 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i64> [[_MSPROP6]], i64 [[_MSLD]], i32 7 +; CHECK-NEXT: [[B64:%.*]] = insertelement <8 x i64> [[VECINIT_I6]], i64 [[Q]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = 
bitcast <8 x i64> [[_MSPROP7]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP8:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP8]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP33]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement < 8 x i64> undef, i64 %q, i32 0 + %vecinit.i1 = insertelement < 8 x i64> %vecinit.i, i64 %q, i32 1 + %vecinit.i2 = insertelement < 8 x i64> %vecinit.i1, i64 %q, i32 2 + %vecinit.i3 = insertelement < 8 x i64> %vecinit.i2, i64 %q, i32 3 + %vecinit.i4 = insertelement < 8 x i64> %vecinit.i3, i64 %q, i32 4 + %vecinit.i5 = insertelement < 8 x i64> %vecinit.i4, i64 %q, i32 5 + %vecinit.i6 = insertelement < 8 x i64> %vecinit.i5, i64 %q, i32 6 + %b64 = insertelement < 8 x i64> %vecinit.i6, i64 %q, i32 7 + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8) + +define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_mask_mul_epu32_rr( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; 
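; Starting with test_mask_mul_epu32_rr, the checks cover the unsigned
; pmuludq lowering: instead of a shl/ashr sign-extension pair, data and
; shadow are masked with "and ... splat (i64 4294967295)" (0xFFFFFFFF) to
; keep only the low 32 bits of each lane. The extra and/or terms around it
; appear to come from MSan's rule for "and" with a constant, under which a
; result bit is poisoned only if it is poisoned in one operand and not
; known zero in the other. A reduced sketch with illustrative names:
;   %s_lo_a = and <8 x i64> %s_a, splat (i64 4294967295)
;   %s_lo_b = and <8 x i64> %s_b, splat (i64 4294967295)
;   %s_mul  = or <8 x i64> %s_lo_a, %s_lo_b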
CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP3]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rrk( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> 
[[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[_MSPROP]] +; CHECK-NEXT: [[TMP27:%.*]] = or <8 x i64> [[TMP26]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP27]], <8 x i64> [[TMP24]] +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP28]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rrkz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]] +; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP27]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: 
@test_mask_mul_epu32_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP24]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rmk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 
[[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP33]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rmkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to 
<8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP32]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rmb( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; 
CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP25]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rmbk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> 
zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[TMP23]] +; CHECK-NEXT: [[TMP26:%.*]] = and <8 x i64> [[TMP14]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP30:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP32]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP33]], <8 x i64> [[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP34]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rmbkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; 
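; The xor with 87960930222080 (0x500000000000) checked throughout these
; memory-operand tests is the application-to-shadow address mapping MSan
; uses on x86-64 Linux, and the [[_MSCMP]] branch to
; __msan_warning_noreturn ahead of each load fires when the pointer
; argument's own shadow is nonzero. A reduced sketch with illustrative
; names:
;   %app    = ptrtoint ptr %p to i64
;   %shadow = xor i64 %app, 87960930222080            ; 0x500000000000
;   %sptr   = inttoptr i64 %shadow to ptr
;   %s_val  = load i64, ptr %sptr, align 8            ; shadow of *%p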
CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP33]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8) + +define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_vextractf32x4( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[A:%.*]], <16 x float> [[A]], <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i32> [[TMP10]], [[_MSPROP]] +; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP12]], <4 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[B]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP13]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8) + +define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_vextracti64x4( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[A:%.*]], <8 x i64> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP]], <4 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i64> [[TMP4]], [[B:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i64> [[TMP10]], <4 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP4]], <4 x i64> [[B]] +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP11]] +; + %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 1, <4 x i64> %b, i8 %mask) + ret <4 x i64> %res +} + +declare <4 x i64> 
@llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8) + +define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_vextracti32x4( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[A]], <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP10]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8) + +define <4 x double> @test_vextractf64x4(<8 x double> %a) #0 { +; CHECK-LABEL: @test_vextractf64x4( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[TMP2]] +; + %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 1, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8) + +declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_insertf32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> 
[[X0:%.*]], <16 x float> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP4]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_insertf32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[X3:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP14]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> [[X3]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP15]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, i16 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_insertf32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: 
[[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP12]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP13]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) + ret <16 x float> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_inserti32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_inserti32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x 
i32> [[_MSPROP1]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X3:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X3]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP13]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, i16 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_inserti32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) + ret <16 x i32> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_insertf64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> 
[[X0:%.*]], <8 x double> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP4]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_insertf64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[X3:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP14]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> [[X3]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP15]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, i8 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_insertf64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 
x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP12]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_inserti64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_inserti64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X3:%.*]] +; 
CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X3]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP13]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, i8 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_inserti64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_movntdqa(ptr %a0) #0 { +; +; CHECK-LABEL: @test_x86_avx512_movntdqa( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] +; CHECK: 2: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[A0:%.*]], align 64, !nontemporal [[META2]] +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A0]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64 +; CHECK-NEXT: store <8 x i64> [[_MSLD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %res = call <8 x i64> @llvm.x86.avx512.movntdqa(ptr %a0) + ret <8 x i64> %res +} + +declare <8 x i64> 
@llvm.x86.avx512.movntdqa(ptr) nounwind readonly + +define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_cmp_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[TMP4]], splat (i32 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP10]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP12]], [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i32> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp ult <16 x i32> [[TMP14]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <16 x i32> [[TMP15]], [[TMP18]] +; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i1> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp slt <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP22]] to i16 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP24]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP27:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP28:%.*]] = and <16 x i32> [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i32> [[TMP26]], [[TMP1]] +; CHECK-NEXT: [[TMP30:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP31:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP32:%.*]] = and <16 x i32> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = or <16 x i32> [[TMP30]], [[TMP2]] +; CHECK-NEXT: [[TMP34:%.*]] = icmp ule <16 x i32> [[TMP28]], [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = icmp ule <16 x i32> [[TMP29]], [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = xor <16 x i1> [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp sle <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i1> [[TMP36]] to i16 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i1> [[TMP37]] to i16 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP38]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], 
i16 [[TMP39]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 0, i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 0, i32 3 +; CHECK-NEXT: [[TMP40:%.*]] = xor <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i32> [[TMP41]], zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = xor <16 x i32> [[TMP41]], splat (i32 -1) +; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i32> [[TMP43]], [[TMP40]] +; CHECK-NEXT: [[TMP45:%.*]] = icmp eq <16 x i32> [[TMP44]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP42]], [[TMP45]] +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP47:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP4]] to i16 +; CHECK-NEXT: [[TMP48:%.*]] = bitcast <16 x i1> [[TMP46]] to i16 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP47]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP48]], i32 4 +; CHECK-NEXT: [[TMP49:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP50:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP51:%.*]] = and <16 x i32> [[TMP49]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = or <16 x i32> [[TMP49]], [[TMP1]] +; CHECK-NEXT: [[TMP53:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP54:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP55:%.*]] = and <16 x i32> [[TMP53]], [[TMP54]] +; CHECK-NEXT: [[TMP56:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] +; CHECK-NEXT: [[TMP57:%.*]] = icmp uge <16 x i32> [[TMP51]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = icmp uge <16 x i32> [[TMP52]], [[TMP55]] +; CHECK-NEXT: [[TMP59:%.*]] = xor <16 x i1> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP60:%.*]] = icmp sge <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP61:%.*]] = bitcast <16 x i1> [[TMP59]] to i16 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast <16 x i1> [[TMP60]] to i16 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP61]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP62]], i32 5 +; CHECK-NEXT: [[TMP63:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP64:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP65:%.*]] = and <16 x i32> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = or <16 x i32> [[TMP63]], [[TMP1]] +; CHECK-NEXT: [[TMP67:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP68:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP69:%.*]] = and <16 x i32> [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = or <16 x i32> [[TMP67]], [[TMP2]] +; CHECK-NEXT: [[TMP71:%.*]] = icmp ugt <16 x i32> [[TMP65]], [[TMP70]] +; CHECK-NEXT: [[TMP72:%.*]] = icmp ugt <16 x i32> [[TMP66]], [[TMP69]] +; CHECK-NEXT: [[TMP73:%.*]] = xor <16 x i1> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = icmp sgt <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP75:%.*]] = bitcast <16 x i1> [[TMP73]] to i16 +; CHECK-NEXT: [[TMP76:%.*]] = bitcast <16 x i1> [[TMP74]] to i16 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP75]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP76]], i32 6 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 0, i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 -1, i32 7 +; CHECK-NEXT: store <8 x i16> 
[[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[VEC7]] +; + %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 + %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 + %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 + %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 + %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 + %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 + %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 + %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_cmp_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP5]], splat (i32 -1) +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i1> [[TMP17]] to i16 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP19]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i32> [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> 
[[TMP21]], [[TMP1]] +; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP27:%.*]] = and <16 x i32> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = or <16 x i32> [[TMP25]], [[TMP2]] +; CHECK-NEXT: [[TMP29:%.*]] = icmp ult <16 x i32> [[TMP23]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <16 x i32> [[TMP24]], [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = xor <16 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = icmp slt <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i1> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i1> [[TMP31]], [[TMP34]] +; CHECK-NEXT: [[TMP38:%.*]] = or <16 x i1> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP39:%.*]] = or <16 x i1> [[TMP38]], [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i1> [[TMP32]], [[TMP34]] +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i1> [[TMP39]] to i16 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i1> [[TMP40]] to i16 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP41]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP42]], i32 1 +; CHECK-NEXT: [[TMP43:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP44:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP45:%.*]] = and <16 x i32> [[TMP43]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or <16 x i32> [[TMP43]], [[TMP1]] +; CHECK-NEXT: [[TMP47:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP48:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i32> [[TMP47]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = or <16 x i32> [[TMP47]], [[TMP2]] +; CHECK-NEXT: [[TMP51:%.*]] = icmp ule <16 x i32> [[TMP45]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = icmp ule <16 x i32> [[TMP46]], [[TMP49]] +; CHECK-NEXT: [[TMP53:%.*]] = xor <16 x i1> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = icmp sle <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP55:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP56:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP57:%.*]] = and <16 x i1> [[TMP53]], [[TMP55]] +; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP54]], [[TMP55]] +; CHECK-NEXT: [[TMP59:%.*]] = and <16 x i1> [[TMP53]], [[TMP56]] +; CHECK-NEXT: [[TMP60:%.*]] = or <16 x i1> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP61:%.*]] = or <16 x i1> [[TMP60]], [[TMP59]] +; CHECK-NEXT: [[TMP62:%.*]] = and <16 x i1> [[TMP54]], [[TMP56]] +; CHECK-NEXT: [[TMP63:%.*]] = bitcast <16 x i1> [[TMP61]] to i16 +; CHECK-NEXT: [[TMP64:%.*]] = bitcast <16 x i1> [[TMP62]] to i16 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP63]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[TMP64]], i32 2 +; CHECK-NEXT: [[TMP65:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP66:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP67:%.*]] = and <16 x i1> zeroinitializer, [[TMP65]] +; CHECK-NEXT: [[TMP68:%.*]] = and <16 x i1> zeroinitializer, [[TMP65]] +; CHECK-NEXT: [[TMP69:%.*]] = and <16 x i1> zeroinitializer, [[TMP66]] +; CHECK-NEXT: [[TMP70:%.*]] = or <16 x i1> [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[TMP71:%.*]] = or <16 x i1> [[TMP70]], 
[[TMP69]] +; CHECK-NEXT: [[TMP72:%.*]] = and <16 x i1> zeroinitializer, [[TMP66]] +; CHECK-NEXT: [[TMP73:%.*]] = bitcast <16 x i1> [[TMP71]] to i16 +; CHECK-NEXT: [[TMP74:%.*]] = bitcast <16 x i1> [[TMP72]] to i16 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP73]], i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[TMP74]], i32 3 +; CHECK-NEXT: [[TMP75:%.*]] = xor <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP76:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <16 x i32> [[TMP76]], zeroinitializer +; CHECK-NEXT: [[TMP78:%.*]] = xor <16 x i32> [[TMP76]], splat (i32 -1) +; CHECK-NEXT: [[TMP79:%.*]] = and <16 x i32> [[TMP78]], [[TMP75]] +; CHECK-NEXT: [[TMP80:%.*]] = icmp eq <16 x i32> [[TMP79]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP77]], [[TMP80]] +; CHECK-NEXT: [[TMP81:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP82:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP83:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP84:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP82]] +; CHECK-NEXT: [[TMP85:%.*]] = and <16 x i1> [[TMP81]], [[TMP82]] +; CHECK-NEXT: [[TMP86:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP83]] +; CHECK-NEXT: [[TMP87:%.*]] = or <16 x i1> [[TMP84]], [[TMP85]] +; CHECK-NEXT: [[TMP88:%.*]] = or <16 x i1> [[TMP87]], [[TMP86]] +; CHECK-NEXT: [[TMP89:%.*]] = and <16 x i1> [[TMP81]], [[TMP83]] +; CHECK-NEXT: [[TMP90:%.*]] = bitcast <16 x i1> [[TMP88]] to i16 +; CHECK-NEXT: [[TMP91:%.*]] = bitcast <16 x i1> [[TMP89]] to i16 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP90]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP91]], i32 4 +; CHECK-NEXT: [[TMP92:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP93:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP94:%.*]] = and <16 x i32> [[TMP92]], [[TMP93]] +; CHECK-NEXT: [[TMP95:%.*]] = or <16 x i32> [[TMP92]], [[TMP1]] +; CHECK-NEXT: [[TMP96:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP97:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP98:%.*]] = and <16 x i32> [[TMP96]], [[TMP97]] +; CHECK-NEXT: [[TMP99:%.*]] = or <16 x i32> [[TMP96]], [[TMP2]] +; CHECK-NEXT: [[TMP100:%.*]] = icmp uge <16 x i32> [[TMP94]], [[TMP99]] +; CHECK-NEXT: [[TMP101:%.*]] = icmp uge <16 x i32> [[TMP95]], [[TMP98]] +; CHECK-NEXT: [[TMP102:%.*]] = xor <16 x i1> [[TMP100]], [[TMP101]] +; CHECK-NEXT: [[TMP103:%.*]] = icmp sge <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP104:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP105:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP106:%.*]] = and <16 x i1> [[TMP102]], [[TMP104]] +; CHECK-NEXT: [[TMP107:%.*]] = and <16 x i1> [[TMP103]], [[TMP104]] +; CHECK-NEXT: [[TMP108:%.*]] = and <16 x i1> [[TMP102]], [[TMP105]] +; CHECK-NEXT: [[TMP109:%.*]] = or <16 x i1> [[TMP106]], [[TMP107]] +; CHECK-NEXT: [[TMP110:%.*]] = or <16 x i1> [[TMP109]], [[TMP108]] +; CHECK-NEXT: [[TMP111:%.*]] = and <16 x i1> [[TMP103]], [[TMP105]] +; CHECK-NEXT: [[TMP112:%.*]] = bitcast <16 x i1> [[TMP110]] to i16 +; CHECK-NEXT: [[TMP113:%.*]] = bitcast <16 x i1> [[TMP111]] to i16 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP112]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP113]], i32 5 +; CHECK-NEXT: [[TMP114:%.*]] = xor <16 x i32> 
[[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP115:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP116:%.*]] = and <16 x i32> [[TMP114]], [[TMP115]] +; CHECK-NEXT: [[TMP117:%.*]] = or <16 x i32> [[TMP114]], [[TMP1]] +; CHECK-NEXT: [[TMP118:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP119:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP120:%.*]] = and <16 x i32> [[TMP118]], [[TMP119]] +; CHECK-NEXT: [[TMP121:%.*]] = or <16 x i32> [[TMP118]], [[TMP2]] +; CHECK-NEXT: [[TMP122:%.*]] = icmp ugt <16 x i32> [[TMP116]], [[TMP121]] +; CHECK-NEXT: [[TMP123:%.*]] = icmp ugt <16 x i32> [[TMP117]], [[TMP120]] +; CHECK-NEXT: [[TMP124:%.*]] = xor <16 x i1> [[TMP122]], [[TMP123]] +; CHECK-NEXT: [[TMP125:%.*]] = icmp sgt <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP126:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP127:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP128:%.*]] = and <16 x i1> [[TMP124]], [[TMP126]] +; CHECK-NEXT: [[TMP129:%.*]] = and <16 x i1> [[TMP125]], [[TMP126]] +; CHECK-NEXT: [[TMP130:%.*]] = and <16 x i1> [[TMP124]], [[TMP127]] +; CHECK-NEXT: [[TMP131:%.*]] = or <16 x i1> [[TMP128]], [[TMP129]] +; CHECK-NEXT: [[TMP132:%.*]] = or <16 x i1> [[TMP131]], [[TMP130]] +; CHECK-NEXT: [[TMP133:%.*]] = and <16 x i1> [[TMP125]], [[TMP127]] +; CHECK-NEXT: [[TMP134:%.*]] = bitcast <16 x i1> [[TMP132]] to i16 +; CHECK-NEXT: [[TMP135:%.*]] = bitcast <16 x i1> [[TMP133]] to i16 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP134]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP135]], i32 6 +; CHECK-NEXT: [[TMP136:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP137:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP138:%.*]] = and <16 x i1> zeroinitializer, [[TMP136]] +; CHECK-NEXT: [[TMP139:%.*]] = and <16 x i1> splat (i1 true), [[TMP136]] +; CHECK-NEXT: [[TMP140:%.*]] = and <16 x i1> zeroinitializer, [[TMP137]] +; CHECK-NEXT: [[TMP141:%.*]] = or <16 x i1> [[TMP138]], [[TMP139]] +; CHECK-NEXT: [[TMP142:%.*]] = or <16 x i1> [[TMP141]], [[TMP140]] +; CHECK-NEXT: [[TMP143:%.*]] = and <16 x i1> splat (i1 true), [[TMP137]] +; CHECK-NEXT: [[TMP144:%.*]] = bitcast <16 x i1> [[TMP142]] to i16 +; CHECK-NEXT: [[TMP145:%.*]] = bitcast <16 x i1> [[TMP143]] to i16 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 [[TMP144]], i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 [[TMP145]], i32 7 +; CHECK-NEXT: store <8 x i16> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[VEC7]] +; + %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 + %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 + %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 + %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 + %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 + %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 
5, i16 %mask) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 + %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 + %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone + +define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_ucmp_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[TMP4]], splat (i32 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP10]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[A0]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i32> [[A1]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp ult <16 x i32> [[TMP13]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp ult <16 x i32> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i1> [[TMP20]] to i16 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP21]] to i16 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP22]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP25:%.*]] = and <16 x i32> [[A0]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP27:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP28:%.*]] = and <16 x i32> [[A1]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp ule <16 x i32> [[TMP25]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp ule <16 x i32> [[TMP26]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = xor <16 x i1> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = icmp ule <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP34:%.*]] = bitcast <16 x i1> [[TMP32]] to i16 +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i1> [[TMP33]] to i16 +; CHECK-NEXT: 
[[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP34]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[TMP35]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 0, i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 0, i32 3 +; CHECK-NEXT: [[TMP36:%.*]] = xor <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP37:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <16 x i32> [[TMP37]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = xor <16 x i32> [[TMP37]], splat (i32 -1) +; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i32> [[TMP39]], [[TMP36]] +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <16 x i32> [[TMP40]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP38]], [[TMP41]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP4]] to i16 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast <16 x i1> [[TMP42]] to i16 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP43]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP44]], i32 4 +; CHECK-NEXT: [[TMP45:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP46:%.*]] = and <16 x i32> [[A0]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP48:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i32> [[A1]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP51:%.*]] = icmp uge <16 x i32> [[TMP46]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = icmp uge <16 x i32> [[TMP47]], [[TMP49]] +; CHECK-NEXT: [[TMP53:%.*]] = xor <16 x i1> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = icmp uge <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP55:%.*]] = bitcast <16 x i1> [[TMP53]] to i16 +; CHECK-NEXT: [[TMP56:%.*]] = bitcast <16 x i1> [[TMP54]] to i16 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP55]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP56]], i32 5 +; CHECK-NEXT: [[TMP57:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i32> [[A0]], [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP60:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP61:%.*]] = and <16 x i32> [[A1]], [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP63:%.*]] = icmp ugt <16 x i32> [[TMP58]], [[TMP62]] +; CHECK-NEXT: [[TMP64:%.*]] = icmp ugt <16 x i32> [[TMP59]], [[TMP61]] +; CHECK-NEXT: [[TMP65:%.*]] = xor <16 x i1> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = icmp ugt <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP67:%.*]] = bitcast <16 x i1> [[TMP65]] to i16 +; CHECK-NEXT: [[TMP68:%.*]] = bitcast <16 x i1> [[TMP66]] to i16 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP67]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP68]], i32 6 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 0, i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 -1, i32 7 +; CHECK-NEXT: store <8 x i16> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[VEC7]] +; + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) + %vec0 = 
insertelement <8 x i16> undef, i16 %res0, i32 0 + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_ucmp_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP5]], splat (i32 -1) +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i1> [[TMP17]] to i16 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP19]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i32> [[A0]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP25:%.*]] = and <16 x i32> [[A1]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ult <16 x i32> 
[[TMP22]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp ult <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-NEXT: [[TMP29:%.*]] = xor <16 x i1> [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = and <16 x i1> [[TMP29]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = and <16 x i1> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i1> [[TMP29]], [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = or <16 x i1> [[TMP33]], [[TMP34]] +; CHECK-NEXT: [[TMP37:%.*]] = or <16 x i1> [[TMP36]], [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP30]], [[TMP32]] +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i1> [[TMP37]] to i16 +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i1> [[TMP38]] to i16 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP39]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP40]], i32 1 +; CHECK-NEXT: [[TMP41:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP42:%.*]] = and <16 x i32> [[A0]], [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP44:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP45:%.*]] = and <16 x i32> [[A1]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP47:%.*]] = icmp ule <16 x i32> [[TMP42]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = icmp ule <16 x i32> [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP49:%.*]] = xor <16 x i1> [[TMP47]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = icmp ule <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP51:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP52:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP53:%.*]] = and <16 x i1> [[TMP49]], [[TMP51]] +; CHECK-NEXT: [[TMP54:%.*]] = and <16 x i1> [[TMP50]], [[TMP51]] +; CHECK-NEXT: [[TMP55:%.*]] = and <16 x i1> [[TMP49]], [[TMP52]] +; CHECK-NEXT: [[TMP56:%.*]] = or <16 x i1> [[TMP53]], [[TMP54]] +; CHECK-NEXT: [[TMP57:%.*]] = or <16 x i1> [[TMP56]], [[TMP55]] +; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP50]], [[TMP52]] +; CHECK-NEXT: [[TMP59:%.*]] = bitcast <16 x i1> [[TMP57]] to i16 +; CHECK-NEXT: [[TMP60:%.*]] = bitcast <16 x i1> [[TMP58]] to i16 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP59]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[TMP60]], i32 2 +; CHECK-NEXT: [[TMP61:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP62:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP63:%.*]] = and <16 x i1> zeroinitializer, [[TMP61]] +; CHECK-NEXT: [[TMP64:%.*]] = and <16 x i1> zeroinitializer, [[TMP61]] +; CHECK-NEXT: [[TMP65:%.*]] = and <16 x i1> zeroinitializer, [[TMP62]] +; CHECK-NEXT: [[TMP66:%.*]] = or <16 x i1> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP67:%.*]] = or <16 x i1> [[TMP66]], [[TMP65]] +; CHECK-NEXT: [[TMP68:%.*]] = and <16 x i1> zeroinitializer, [[TMP62]] +; CHECK-NEXT: [[TMP69:%.*]] = bitcast <16 x i1> [[TMP67]] to i16 +; CHECK-NEXT: [[TMP70:%.*]] = bitcast <16 x i1> [[TMP68]] to i16 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP69]], i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[TMP70]], i32 3 +; CHECK-NEXT: [[TMP71:%.*]] = xor <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP72:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; 
CHECK-NEXT: [[TMP73:%.*]] = icmp ne <16 x i32> [[TMP72]], zeroinitializer +; CHECK-NEXT: [[TMP74:%.*]] = xor <16 x i32> [[TMP72]], splat (i32 -1) +; CHECK-NEXT: [[TMP75:%.*]] = and <16 x i32> [[TMP74]], [[TMP71]] +; CHECK-NEXT: [[TMP76:%.*]] = icmp eq <16 x i32> [[TMP75]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP73]], [[TMP76]] +; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP78:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP79:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP80:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP78]] +; CHECK-NEXT: [[TMP81:%.*]] = and <16 x i1> [[TMP77]], [[TMP78]] +; CHECK-NEXT: [[TMP82:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP79]] +; CHECK-NEXT: [[TMP83:%.*]] = or <16 x i1> [[TMP80]], [[TMP81]] +; CHECK-NEXT: [[TMP84:%.*]] = or <16 x i1> [[TMP83]], [[TMP82]] +; CHECK-NEXT: [[TMP85:%.*]] = and <16 x i1> [[TMP77]], [[TMP79]] +; CHECK-NEXT: [[TMP86:%.*]] = bitcast <16 x i1> [[TMP84]] to i16 +; CHECK-NEXT: [[TMP87:%.*]] = bitcast <16 x i1> [[TMP85]] to i16 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP86]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP87]], i32 4 +; CHECK-NEXT: [[TMP88:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP89:%.*]] = and <16 x i32> [[A0]], [[TMP88]] +; CHECK-NEXT: [[TMP90:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP91:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP92:%.*]] = and <16 x i32> [[A1]], [[TMP91]] +; CHECK-NEXT: [[TMP93:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP94:%.*]] = icmp uge <16 x i32> [[TMP89]], [[TMP93]] +; CHECK-NEXT: [[TMP95:%.*]] = icmp uge <16 x i32> [[TMP90]], [[TMP92]] +; CHECK-NEXT: [[TMP96:%.*]] = xor <16 x i1> [[TMP94]], [[TMP95]] +; CHECK-NEXT: [[TMP97:%.*]] = icmp uge <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP98:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP99:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP100:%.*]] = and <16 x i1> [[TMP96]], [[TMP98]] +; CHECK-NEXT: [[TMP101:%.*]] = and <16 x i1> [[TMP97]], [[TMP98]] +; CHECK-NEXT: [[TMP102:%.*]] = and <16 x i1> [[TMP96]], [[TMP99]] +; CHECK-NEXT: [[TMP103:%.*]] = or <16 x i1> [[TMP100]], [[TMP101]] +; CHECK-NEXT: [[TMP104:%.*]] = or <16 x i1> [[TMP103]], [[TMP102]] +; CHECK-NEXT: [[TMP105:%.*]] = and <16 x i1> [[TMP97]], [[TMP99]] +; CHECK-NEXT: [[TMP106:%.*]] = bitcast <16 x i1> [[TMP104]] to i16 +; CHECK-NEXT: [[TMP107:%.*]] = bitcast <16 x i1> [[TMP105]] to i16 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP106]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP107]], i32 5 +; CHECK-NEXT: [[TMP108:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP109:%.*]] = and <16 x i32> [[A0]], [[TMP108]] +; CHECK-NEXT: [[TMP110:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP111:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP112:%.*]] = and <16 x i32> [[A1]], [[TMP111]] +; CHECK-NEXT: [[TMP113:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP114:%.*]] = icmp ugt <16 x i32> [[TMP109]], [[TMP113]] +; CHECK-NEXT: [[TMP115:%.*]] = icmp ugt <16 x i32> [[TMP110]], [[TMP112]] +; CHECK-NEXT: [[TMP116:%.*]] = xor <16 x i1> [[TMP114]], [[TMP115]] +; CHECK-NEXT: [[TMP117:%.*]] = icmp ugt <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP118:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> 
+; CHECK-NEXT: [[TMP119:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP120:%.*]] = and <16 x i1> [[TMP116]], [[TMP118]] +; CHECK-NEXT: [[TMP121:%.*]] = and <16 x i1> [[TMP117]], [[TMP118]] +; CHECK-NEXT: [[TMP122:%.*]] = and <16 x i1> [[TMP116]], [[TMP119]] +; CHECK-NEXT: [[TMP123:%.*]] = or <16 x i1> [[TMP120]], [[TMP121]] +; CHECK-NEXT: [[TMP124:%.*]] = or <16 x i1> [[TMP123]], [[TMP122]] +; CHECK-NEXT: [[TMP125:%.*]] = and <16 x i1> [[TMP117]], [[TMP119]] +; CHECK-NEXT: [[TMP126:%.*]] = bitcast <16 x i1> [[TMP124]] to i16 +; CHECK-NEXT: [[TMP127:%.*]] = bitcast <16 x i1> [[TMP125]] to i16 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP126]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP127]], i32 6 +; CHECK-NEXT: [[TMP128:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP129:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP130:%.*]] = and <16 x i1> zeroinitializer, [[TMP128]] +; CHECK-NEXT: [[TMP131:%.*]] = and <16 x i1> splat (i1 true), [[TMP128]] +; CHECK-NEXT: [[TMP132:%.*]] = and <16 x i1> zeroinitializer, [[TMP129]] +; CHECK-NEXT: [[TMP133:%.*]] = or <16 x i1> [[TMP130]], [[TMP131]] +; CHECK-NEXT: [[TMP134:%.*]] = or <16 x i1> [[TMP133]], [[TMP132]] +; CHECK-NEXT: [[TMP135:%.*]] = and <16 x i1> splat (i1 true), [[TMP129]] +; CHECK-NEXT: [[TMP136:%.*]] = bitcast <16 x i1> [[TMP134]] to i16 +; CHECK-NEXT: [[TMP137:%.*]] = bitcast <16 x i1> [[TMP135]] to i16 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 [[TMP136]], i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 [[TMP137]], i32 7 +; CHECK-NEXT: store <8 x i16> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[VEC7]] +; + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone + +define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_cmp_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[TMP4]], splat (i64 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP9]] to i8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP10]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP12]], [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp ult <8 x i64> [[TMP14]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <8 x i64> [[TMP15]], [[TMP18]] +; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i1> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp slt <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP22]] to i8 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i1> [[TMP23]] to i8 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP24]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP27:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP28:%.*]] = and <8 x i64> [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[TMP26]], [[TMP1]] +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP32:%.*]] = and <8 x i64> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP30]], [[TMP2]] +; CHECK-NEXT: [[TMP34:%.*]] = icmp ule <8 x i64> [[TMP28]], [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = icmp ule <8 x i64> [[TMP29]], [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = xor <8 x i1> [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp sle <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i1> [[TMP36]] to i8 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i1> [[TMP37]] to i8 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP38]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP39]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 0, i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 0, i32 3 +; CHECK-NEXT: [[TMP40:%.*]] = xor <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP41:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i64> [[TMP41]], zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = xor <8 x 
i64> [[TMP41]], splat (i64 -1) +; CHECK-NEXT: [[TMP44:%.*]] = and <8 x i64> [[TMP43]], [[TMP40]] +; CHECK-NEXT: [[TMP45:%.*]] = icmp eq <8 x i64> [[TMP44]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP42]], [[TMP45]] +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP47:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP4]] to i8 +; CHECK-NEXT: [[TMP48:%.*]] = bitcast <8 x i1> [[TMP46]] to i8 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP47]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP48]], i32 4 +; CHECK-NEXT: [[TMP49:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP50:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP51:%.*]] = and <8 x i64> [[TMP49]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = or <8 x i64> [[TMP49]], [[TMP1]] +; CHECK-NEXT: [[TMP53:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP54:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP55:%.*]] = and <8 x i64> [[TMP53]], [[TMP54]] +; CHECK-NEXT: [[TMP56:%.*]] = or <8 x i64> [[TMP53]], [[TMP2]] +; CHECK-NEXT: [[TMP57:%.*]] = icmp uge <8 x i64> [[TMP51]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = icmp uge <8 x i64> [[TMP52]], [[TMP55]] +; CHECK-NEXT: [[TMP59:%.*]] = xor <8 x i1> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP60:%.*]] = icmp sge <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP61:%.*]] = bitcast <8 x i1> [[TMP59]] to i8 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast <8 x i1> [[TMP60]] to i8 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP61]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP62]], i32 5 +; CHECK-NEXT: [[TMP63:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP64:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP65:%.*]] = and <8 x i64> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = or <8 x i64> [[TMP63]], [[TMP1]] +; CHECK-NEXT: [[TMP67:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP68:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP69:%.*]] = and <8 x i64> [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = or <8 x i64> [[TMP67]], [[TMP2]] +; CHECK-NEXT: [[TMP71:%.*]] = icmp ugt <8 x i64> [[TMP65]], [[TMP70]] +; CHECK-NEXT: [[TMP72:%.*]] = icmp ugt <8 x i64> [[TMP66]], [[TMP69]] +; CHECK-NEXT: [[TMP73:%.*]] = xor <8 x i1> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = icmp sgt <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP75:%.*]] = bitcast <8 x i1> [[TMP73]] to i8 +; CHECK-NEXT: [[TMP76:%.*]] = bitcast <8 x i1> [[TMP74]] to i8 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP75]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP76]], i32 6 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 0, i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 -1, i32 7 +; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[VEC7]] +; + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 
x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_cmp_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP5]], splat (i64 -1) +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i1> [[TMP17]] to i8 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP18]] to i8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP19]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP21]], [[TMP1]] +; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP26:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i64> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = or <8 x i64> [[TMP25]], [[TMP2]] +; CHECK-NEXT: [[TMP29:%.*]] = icmp ult <8 x i64> [[TMP23]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <8 x i64> [[TMP24]], [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: 
[[TMP32:%.*]] = icmp slt <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP36:%.*]] = and <8 x i1> [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = and <8 x i1> [[TMP31]], [[TMP34]] +; CHECK-NEXT: [[TMP38:%.*]] = or <8 x i1> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP39:%.*]] = or <8 x i1> [[TMP38]], [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = and <8 x i1> [[TMP32]], [[TMP34]] +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i1> [[TMP39]] to i8 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i1> [[TMP40]] to i8 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP41]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP42]], i32 1 +; CHECK-NEXT: [[TMP43:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP44:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP45:%.*]] = and <8 x i64> [[TMP43]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or <8 x i64> [[TMP43]], [[TMP1]] +; CHECK-NEXT: [[TMP47:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP48:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP49:%.*]] = and <8 x i64> [[TMP47]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = or <8 x i64> [[TMP47]], [[TMP2]] +; CHECK-NEXT: [[TMP51:%.*]] = icmp ule <8 x i64> [[TMP45]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = icmp ule <8 x i64> [[TMP46]], [[TMP49]] +; CHECK-NEXT: [[TMP53:%.*]] = xor <8 x i1> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = icmp sle <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP55:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP56:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP57:%.*]] = and <8 x i1> [[TMP53]], [[TMP55]] +; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i1> [[TMP54]], [[TMP55]] +; CHECK-NEXT: [[TMP59:%.*]] = and <8 x i1> [[TMP53]], [[TMP56]] +; CHECK-NEXT: [[TMP60:%.*]] = or <8 x i1> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP61:%.*]] = or <8 x i1> [[TMP60]], [[TMP59]] +; CHECK-NEXT: [[TMP62:%.*]] = and <8 x i1> [[TMP54]], [[TMP56]] +; CHECK-NEXT: [[TMP63:%.*]] = bitcast <8 x i1> [[TMP61]] to i8 +; CHECK-NEXT: [[TMP64:%.*]] = bitcast <8 x i1> [[TMP62]] to i8 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP63]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP64]], i32 2 +; CHECK-NEXT: [[TMP65:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP66:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP67:%.*]] = and <8 x i1> zeroinitializer, [[TMP65]] +; CHECK-NEXT: [[TMP68:%.*]] = and <8 x i1> zeroinitializer, [[TMP65]] +; CHECK-NEXT: [[TMP69:%.*]] = and <8 x i1> zeroinitializer, [[TMP66]] +; CHECK-NEXT: [[TMP70:%.*]] = or <8 x i1> [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[TMP71:%.*]] = or <8 x i1> [[TMP70]], [[TMP69]] +; CHECK-NEXT: [[TMP72:%.*]] = and <8 x i1> zeroinitializer, [[TMP66]] +; CHECK-NEXT: [[TMP73:%.*]] = bitcast <8 x i1> [[TMP71]] to i8 +; CHECK-NEXT: [[TMP74:%.*]] = bitcast <8 x i1> [[TMP72]] to i8 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 [[TMP73]], i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 [[TMP74]], i32 3 +; CHECK-NEXT: [[TMP75:%.*]] = xor <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP76:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <8 x i64> [[TMP76]], 
zeroinitializer +; CHECK-NEXT: [[TMP78:%.*]] = xor <8 x i64> [[TMP76]], splat (i64 -1) +; CHECK-NEXT: [[TMP79:%.*]] = and <8 x i64> [[TMP78]], [[TMP75]] +; CHECK-NEXT: [[TMP80:%.*]] = icmp eq <8 x i64> [[TMP79]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP77]], [[TMP80]] +; CHECK-NEXT: [[TMP81:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP82:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP83:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP84:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP82]] +; CHECK-NEXT: [[TMP85:%.*]] = and <8 x i1> [[TMP81]], [[TMP82]] +; CHECK-NEXT: [[TMP86:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP83]] +; CHECK-NEXT: [[TMP87:%.*]] = or <8 x i1> [[TMP84]], [[TMP85]] +; CHECK-NEXT: [[TMP88:%.*]] = or <8 x i1> [[TMP87]], [[TMP86]] +; CHECK-NEXT: [[TMP89:%.*]] = and <8 x i1> [[TMP81]], [[TMP83]] +; CHECK-NEXT: [[TMP90:%.*]] = bitcast <8 x i1> [[TMP88]] to i8 +; CHECK-NEXT: [[TMP91:%.*]] = bitcast <8 x i1> [[TMP89]] to i8 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP90]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP91]], i32 4 +; CHECK-NEXT: [[TMP92:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP93:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP94:%.*]] = and <8 x i64> [[TMP92]], [[TMP93]] +; CHECK-NEXT: [[TMP95:%.*]] = or <8 x i64> [[TMP92]], [[TMP1]] +; CHECK-NEXT: [[TMP96:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP97:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP98:%.*]] = and <8 x i64> [[TMP96]], [[TMP97]] +; CHECK-NEXT: [[TMP99:%.*]] = or <8 x i64> [[TMP96]], [[TMP2]] +; CHECK-NEXT: [[TMP100:%.*]] = icmp uge <8 x i64> [[TMP94]], [[TMP99]] +; CHECK-NEXT: [[TMP101:%.*]] = icmp uge <8 x i64> [[TMP95]], [[TMP98]] +; CHECK-NEXT: [[TMP102:%.*]] = xor <8 x i1> [[TMP100]], [[TMP101]] +; CHECK-NEXT: [[TMP103:%.*]] = icmp sge <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP104:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP105:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP106:%.*]] = and <8 x i1> [[TMP102]], [[TMP104]] +; CHECK-NEXT: [[TMP107:%.*]] = and <8 x i1> [[TMP103]], [[TMP104]] +; CHECK-NEXT: [[TMP108:%.*]] = and <8 x i1> [[TMP102]], [[TMP105]] +; CHECK-NEXT: [[TMP109:%.*]] = or <8 x i1> [[TMP106]], [[TMP107]] +; CHECK-NEXT: [[TMP110:%.*]] = or <8 x i1> [[TMP109]], [[TMP108]] +; CHECK-NEXT: [[TMP111:%.*]] = and <8 x i1> [[TMP103]], [[TMP105]] +; CHECK-NEXT: [[TMP112:%.*]] = bitcast <8 x i1> [[TMP110]] to i8 +; CHECK-NEXT: [[TMP113:%.*]] = bitcast <8 x i1> [[TMP111]] to i8 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP112]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP113]], i32 5 +; CHECK-NEXT: [[TMP114:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP115:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP116:%.*]] = and <8 x i64> [[TMP114]], [[TMP115]] +; CHECK-NEXT: [[TMP117:%.*]] = or <8 x i64> [[TMP114]], [[TMP1]] +; CHECK-NEXT: [[TMP118:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP119:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP120:%.*]] = and <8 x i64> [[TMP118]], [[TMP119]] +; CHECK-NEXT: [[TMP121:%.*]] = or <8 x i64> [[TMP118]], [[TMP2]] +; CHECK-NEXT: [[TMP122:%.*]] = icmp ugt <8 x i64> [[TMP116]], [[TMP121]] +; 
CHECK-NEXT: [[TMP123:%.*]] = icmp ugt <8 x i64> [[TMP117]], [[TMP120]] +; CHECK-NEXT: [[TMP124:%.*]] = xor <8 x i1> [[TMP122]], [[TMP123]] +; CHECK-NEXT: [[TMP125:%.*]] = icmp sgt <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP126:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP127:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP128:%.*]] = and <8 x i1> [[TMP124]], [[TMP126]] +; CHECK-NEXT: [[TMP129:%.*]] = and <8 x i1> [[TMP125]], [[TMP126]] +; CHECK-NEXT: [[TMP130:%.*]] = and <8 x i1> [[TMP124]], [[TMP127]] +; CHECK-NEXT: [[TMP131:%.*]] = or <8 x i1> [[TMP128]], [[TMP129]] +; CHECK-NEXT: [[TMP132:%.*]] = or <8 x i1> [[TMP131]], [[TMP130]] +; CHECK-NEXT: [[TMP133:%.*]] = and <8 x i1> [[TMP125]], [[TMP127]] +; CHECK-NEXT: [[TMP134:%.*]] = bitcast <8 x i1> [[TMP132]] to i8 +; CHECK-NEXT: [[TMP135:%.*]] = bitcast <8 x i1> [[TMP133]] to i8 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP134]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP135]], i32 6 +; CHECK-NEXT: [[TMP136:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP137:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP138:%.*]] = and <8 x i1> zeroinitializer, [[TMP136]] +; CHECK-NEXT: [[TMP139:%.*]] = and <8 x i1> splat (i1 true), [[TMP136]] +; CHECK-NEXT: [[TMP140:%.*]] = and <8 x i1> zeroinitializer, [[TMP137]] +; CHECK-NEXT: [[TMP141:%.*]] = or <8 x i1> [[TMP138]], [[TMP139]] +; CHECK-NEXT: [[TMP142:%.*]] = or <8 x i1> [[TMP141]], [[TMP140]] +; CHECK-NEXT: [[TMP143:%.*]] = and <8 x i1> splat (i1 true), [[TMP137]] +; CHECK-NEXT: [[TMP144:%.*]] = bitcast <8 x i1> [[TMP142]] to i8 +; CHECK-NEXT: [[TMP145:%.*]] = bitcast <8 x i1> [[TMP143]] to i8 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 [[TMP144]], i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 [[TMP145]], i32 7 +; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[VEC7]] +; + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone + +define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_ucmp_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; 
CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[TMP4]], splat (i64 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP9]] to i8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP10]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[A0]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[A1]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp ult <8 x i64> [[TMP13]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp ult <8 x i64> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i1> [[TMP20]] to i8 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP21]] to i8 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP22]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[A0]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP27:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP28:%.*]] = and <8 x i64> [[A1]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp ule <8 x i64> [[TMP25]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp ule <8 x i64> [[TMP26]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = xor <8 x i1> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = icmp ule <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x i1> [[TMP32]] to i8 +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <8 x i1> [[TMP33]] to i8 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP34]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP35]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 0, i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 0, i32 3 +; CHECK-NEXT: [[TMP36:%.*]] = xor <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP37:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <8 x i64> [[TMP37]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = xor <8 x i64> [[TMP37]], splat (i64 -1) +; CHECK-NEXT: [[TMP40:%.*]] = and <8 x i64> [[TMP39]], [[TMP36]] +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <8 x i64> [[TMP40]], zeroinitializer +; CHECK-NEXT: 
[[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP38]], [[TMP41]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP43:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP4]] to i8 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast <8 x i1> [[TMP42]] to i8 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP43]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP44]], i32 4 +; CHECK-NEXT: [[TMP45:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP46:%.*]] = and <8 x i64> [[A0]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP48:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP49:%.*]] = and <8 x i64> [[A1]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP51:%.*]] = icmp uge <8 x i64> [[TMP46]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = icmp uge <8 x i64> [[TMP47]], [[TMP49]] +; CHECK-NEXT: [[TMP53:%.*]] = xor <8 x i1> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = icmp uge <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP55:%.*]] = bitcast <8 x i1> [[TMP53]] to i8 +; CHECK-NEXT: [[TMP56:%.*]] = bitcast <8 x i1> [[TMP54]] to i8 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP55]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP56]], i32 5 +; CHECK-NEXT: [[TMP57:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i64> [[A0]], [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP60:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP61:%.*]] = and <8 x i64> [[A1]], [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP63:%.*]] = icmp ugt <8 x i64> [[TMP58]], [[TMP62]] +; CHECK-NEXT: [[TMP64:%.*]] = icmp ugt <8 x i64> [[TMP59]], [[TMP61]] +; CHECK-NEXT: [[TMP65:%.*]] = xor <8 x i1> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = icmp ugt <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP67:%.*]] = bitcast <8 x i1> [[TMP65]] to i8 +; CHECK-NEXT: [[TMP68:%.*]] = bitcast <8 x i1> [[TMP66]] to i8 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP67]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP68]], i32 6 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 0, i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 -1, i32 7 +; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[VEC7]] +; + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 + %res6 = call i8 
@llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_ucmp_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP5]], splat (i64 -1) +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i1> [[TMP17]] to i8 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP18]] to i8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP19]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[A0]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[A1]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ult <8 x i64> [[TMP22]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp ult <8 x i64> [[TMP23]], [[TMP25]] +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = and <8 x i1> [[TMP29]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = and <8 x i1> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP29]], [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = or <8 x i1> [[TMP33]], [[TMP34]] +; CHECK-NEXT: [[TMP37:%.*]] = or <8 x i1> [[TMP36]], [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = and <8 x i1> [[TMP30]], [[TMP32]] +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i1> [[TMP37]] to i8 +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i1> [[TMP38]] to i8 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 
x i8> [[_MSPROP]], i8 [[TMP39]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP40]], i32 1 +; CHECK-NEXT: [[TMP41:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP42:%.*]] = and <8 x i64> [[A0]], [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP44:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP45:%.*]] = and <8 x i64> [[A1]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP47:%.*]] = icmp ule <8 x i64> [[TMP42]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = icmp ule <8 x i64> [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP47]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = icmp ule <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP51:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP52:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP53:%.*]] = and <8 x i1> [[TMP49]], [[TMP51]] +; CHECK-NEXT: [[TMP54:%.*]] = and <8 x i1> [[TMP50]], [[TMP51]] +; CHECK-NEXT: [[TMP55:%.*]] = and <8 x i1> [[TMP49]], [[TMP52]] +; CHECK-NEXT: [[TMP56:%.*]] = or <8 x i1> [[TMP53]], [[TMP54]] +; CHECK-NEXT: [[TMP57:%.*]] = or <8 x i1> [[TMP56]], [[TMP55]] +; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i1> [[TMP50]], [[TMP52]] +; CHECK-NEXT: [[TMP59:%.*]] = bitcast <8 x i1> [[TMP57]] to i8 +; CHECK-NEXT: [[TMP60:%.*]] = bitcast <8 x i1> [[TMP58]] to i8 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP59]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP60]], i32 2 +; CHECK-NEXT: [[TMP61:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP62:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP63:%.*]] = and <8 x i1> zeroinitializer, [[TMP61]] +; CHECK-NEXT: [[TMP64:%.*]] = and <8 x i1> zeroinitializer, [[TMP61]] +; CHECK-NEXT: [[TMP65:%.*]] = and <8 x i1> zeroinitializer, [[TMP62]] +; CHECK-NEXT: [[TMP66:%.*]] = or <8 x i1> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP67:%.*]] = or <8 x i1> [[TMP66]], [[TMP65]] +; CHECK-NEXT: [[TMP68:%.*]] = and <8 x i1> zeroinitializer, [[TMP62]] +; CHECK-NEXT: [[TMP69:%.*]] = bitcast <8 x i1> [[TMP67]] to i8 +; CHECK-NEXT: [[TMP70:%.*]] = bitcast <8 x i1> [[TMP68]] to i8 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 [[TMP69]], i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 [[TMP70]], i32 3 +; CHECK-NEXT: [[TMP71:%.*]] = xor <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP72:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP73:%.*]] = icmp ne <8 x i64> [[TMP72]], zeroinitializer +; CHECK-NEXT: [[TMP74:%.*]] = xor <8 x i64> [[TMP72]], splat (i64 -1) +; CHECK-NEXT: [[TMP75:%.*]] = and <8 x i64> [[TMP74]], [[TMP71]] +; CHECK-NEXT: [[TMP76:%.*]] = icmp eq <8 x i64> [[TMP75]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP73]], [[TMP76]] +; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP78:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP79:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP80:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP78]] +; CHECK-NEXT: [[TMP81:%.*]] = and <8 x i1> [[TMP77]], [[TMP78]] +; CHECK-NEXT: [[TMP82:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP79]] +; CHECK-NEXT: [[TMP83:%.*]] = or <8 x i1> [[TMP80]], [[TMP81]] +; CHECK-NEXT: [[TMP84:%.*]] = or <8 x i1> [[TMP83]], [[TMP82]] +; CHECK-NEXT: [[TMP85:%.*]] = and <8 x i1> [[TMP77]], [[TMP79]] +; CHECK-NEXT: [[TMP86:%.*]] = 
bitcast <8 x i1> [[TMP84]] to i8 +; CHECK-NEXT: [[TMP87:%.*]] = bitcast <8 x i1> [[TMP85]] to i8 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP86]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP87]], i32 4 +; CHECK-NEXT: [[TMP88:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP89:%.*]] = and <8 x i64> [[A0]], [[TMP88]] +; CHECK-NEXT: [[TMP90:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP91:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP92:%.*]] = and <8 x i64> [[A1]], [[TMP91]] +; CHECK-NEXT: [[TMP93:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP94:%.*]] = icmp uge <8 x i64> [[TMP89]], [[TMP93]] +; CHECK-NEXT: [[TMP95:%.*]] = icmp uge <8 x i64> [[TMP90]], [[TMP92]] +; CHECK-NEXT: [[TMP96:%.*]] = xor <8 x i1> [[TMP94]], [[TMP95]] +; CHECK-NEXT: [[TMP97:%.*]] = icmp uge <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP98:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP99:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP100:%.*]] = and <8 x i1> [[TMP96]], [[TMP98]] +; CHECK-NEXT: [[TMP101:%.*]] = and <8 x i1> [[TMP97]], [[TMP98]] +; CHECK-NEXT: [[TMP102:%.*]] = and <8 x i1> [[TMP96]], [[TMP99]] +; CHECK-NEXT: [[TMP103:%.*]] = or <8 x i1> [[TMP100]], [[TMP101]] +; CHECK-NEXT: [[TMP104:%.*]] = or <8 x i1> [[TMP103]], [[TMP102]] +; CHECK-NEXT: [[TMP105:%.*]] = and <8 x i1> [[TMP97]], [[TMP99]] +; CHECK-NEXT: [[TMP106:%.*]] = bitcast <8 x i1> [[TMP104]] to i8 +; CHECK-NEXT: [[TMP107:%.*]] = bitcast <8 x i1> [[TMP105]] to i8 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP106]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP107]], i32 5 +; CHECK-NEXT: [[TMP108:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP109:%.*]] = and <8 x i64> [[A0]], [[TMP108]] +; CHECK-NEXT: [[TMP110:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP111:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP112:%.*]] = and <8 x i64> [[A1]], [[TMP111]] +; CHECK-NEXT: [[TMP113:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP114:%.*]] = icmp ugt <8 x i64> [[TMP109]], [[TMP113]] +; CHECK-NEXT: [[TMP115:%.*]] = icmp ugt <8 x i64> [[TMP110]], [[TMP112]] +; CHECK-NEXT: [[TMP116:%.*]] = xor <8 x i1> [[TMP114]], [[TMP115]] +; CHECK-NEXT: [[TMP117:%.*]] = icmp ugt <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP118:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP119:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP120:%.*]] = and <8 x i1> [[TMP116]], [[TMP118]] +; CHECK-NEXT: [[TMP121:%.*]] = and <8 x i1> [[TMP117]], [[TMP118]] +; CHECK-NEXT: [[TMP122:%.*]] = and <8 x i1> [[TMP116]], [[TMP119]] +; CHECK-NEXT: [[TMP123:%.*]] = or <8 x i1> [[TMP120]], [[TMP121]] +; CHECK-NEXT: [[TMP124:%.*]] = or <8 x i1> [[TMP123]], [[TMP122]] +; CHECK-NEXT: [[TMP125:%.*]] = and <8 x i1> [[TMP117]], [[TMP119]] +; CHECK-NEXT: [[TMP126:%.*]] = bitcast <8 x i1> [[TMP124]] to i8 +; CHECK-NEXT: [[TMP127:%.*]] = bitcast <8 x i1> [[TMP125]] to i8 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP126]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP127]], i32 6 +; CHECK-NEXT: [[TMP128:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP129:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP130:%.*]] = and <8 x i1> zeroinitializer, [[TMP128]] +; CHECK-NEXT: [[TMP131:%.*]] = and <8 x i1> splat (i1 true), 
[[TMP128]] +; CHECK-NEXT: [[TMP132:%.*]] = and <8 x i1> zeroinitializer, [[TMP129]] +; CHECK-NEXT: [[TMP133:%.*]] = or <8 x i1> [[TMP130]], [[TMP131]] +; CHECK-NEXT: [[TMP134:%.*]] = or <8 x i1> [[TMP133]], [[TMP132]] +; CHECK-NEXT: [[TMP135:%.*]] = and <8 x i1> splat (i1 true), [[TMP129]] +; CHECK-NEXT: [[TMP136:%.*]] = bitcast <8 x i1> [[TMP134]] to i8 +; CHECK-NEXT: [[TMP137:%.*]] = bitcast <8 x i1> [[TMP135]] to i8 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 [[TMP136]], i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 [[TMP137]], i32 7 +; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[VEC7]] +; + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone + +declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X0:%.*]], <4 x float> [[X0]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X0]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = xor <16
x i32> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X0]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x float> [[TMP15]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT3:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP22]], <16 x i32> [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP17]], <16 x float> [[TMP15]], <16 x float> zeroinitializer +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSPROP_SELECT]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP4]], [[TMP14]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or <16 x i32> [[_MSPROP_SELECT3]], [[_MSPROP4]] +; CHECK-NEXT: [[RES5:%.*]] = fadd <16 x float> [[TMP23]], [[RES4]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES5]] +; + %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1) + %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) + %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %mask) + %res4 = fadd <16 x float> %res1, %res2 + %res5 = fadd <16 x float> %res3, %res4 + ret <16 x float> %res5 +} + +define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(ptr %x0ptr, <16 x float> %x2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf32x4_512_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X0:%.*]] = load <4 x float>, ptr [[X0PTR:%.*]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> [[_MSLD]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X0]], <16
x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %x0 = load <4 x float>, ptr %x0ptr + %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) + ret <16 x float> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_broadcastf64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr 
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_broadcastf64x4_512(<4 x double> %x0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_broadcastf64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(ptr %x0ptr, <8 x double> %x2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf64x4_512_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X0:%.*]] = load <4 x double>, ptr [[X0PTR:%.*]], align 32 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[_MSLD]], <4 x i64> [[_MSLD]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> 
[[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[_MSPROP]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP18]] +; + %x0 = load <4 x double>, ptr %x0ptr + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) + ret <8 x double> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32>, i16) + +define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP15]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT3:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP19]], <16 x i32> [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP15]], <16 x i32> [[TMP13]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { 
<16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP4]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP21]], <16 x i32> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP12]], 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP22]], <16 x i32> [[_MSPROP_SELECT3]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP20]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP23]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %res0 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} + +define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(ptr %x0ptr, <16 x i32> %x2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti32x4_512_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X0:%.*]] = load <4 x i32>, ptr [[X0PTR:%.*]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> [[_MSLD]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %x0 = load <4 x i32>, ptr %x0ptr + %res = call <16 x 
i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_broadcasti64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X2:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_broadcasti64x4_512(<4 x i64> %x0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_broadcasti64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> 
[[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP3]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP10]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(ptr %x0ptr, <8 x i64> %x2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti64x4_512_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X0:%.*]] = load <4 x i64>, ptr [[X0PTR:%.*]], align 32 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[_MSLD]], <4 x i64> [[_MSLD]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[X0]], <4 x i64> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %x0 = load <4 x i64>, ptr %x0ptr + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pabs_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[X0:%.*]], i1 false) +; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) #0 { +; +; 
CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[X0:%.*]], i1 false) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X1:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pabs_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> [[X0:%.*]], i1 false) +; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> [[X0:%.*]], i1 false) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X1:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 
%x2) + ret <8 x i64> %res +} + +define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) #0 { +; +; CHECK-LABEL: @test_vptestmq( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[A0:%.*]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP1]], [[A1:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], splat (i64 -1) +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i1> [[TMP16]] to i8 +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[A0]], [[TMP2]] +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP1]], [[A1]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <8 x i64> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i64> [[TMP26]], splat (i64 -1) +; CHECK-NEXT: [[TMP29:%.*]] = and <8 x i64> [[TMP28]], [[TMP25]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <8 x i64> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <8 x i1> [[TMP27]], [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[M:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = and <8 x i1> [[_MSPROP_ICMP1]], [[TMP32]] +; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP31]], [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = and <8 x i1> [[_MSPROP_ICMP1]], [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = or <8 x i1> [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = or <8 x i1> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[TMP39:%.*]] = and <8 x i1> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i1> [[TMP38]] to i8 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i1> [[TMP39]] to i8 +; CHECK-NEXT: [[_MSPROP:%.*]] = or i8 [[TMP40]], [[TMP17]] +; CHECK-NEXT: [[RES2:%.*]] = add i8 [[TMP41]], [[TMP18]] +; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[RES2]] +; + %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1) + %res1 = call i8 
@llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m) + %res2 = add i8 %res1, %res + ret i8 %res2 +} +declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8) + +define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) #0 { +; +; CHECK-LABEL: @test_vptestmd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[A0:%.*]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP1]], [[A1:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], splat (i32 -1) +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i1> [[TMP16]] to i16 +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i32> [[A0]], [[TMP2]] +; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i32> [[TMP1]], [[A1]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i32> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i32> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = xor <16 x i32> [[TMP26]], splat (i32 -1) +; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i32> [[TMP28]], [[TMP25]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <16 x i32> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> [[TMP27]], [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[M:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = and <16 x i1> [[_MSPROP_ICMP1]], [[TMP32]] +; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i1> [[TMP31]], [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[_MSPROP_ICMP1]], [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = or <16 x i1> [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = or <16 x i1> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[TMP39:%.*]] = and <16 x i1> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i1> [[TMP38]] to i16 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i1> [[TMP39]] to i16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP40]], [[TMP17]] +; CHECK-NEXT: [[RES2:%.*]] = add i16 [[TMP41]], 
[[TMP18]] +; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[RES2]] +; + %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1) + %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m) + %res2 = add i16 %res1, %res + ret i16 %res2 +} +declare i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32>, <16 x i32>, i16) + +declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2) + +define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_ptestnm_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[X0:%.*]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP1]], [[X1:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[X0]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], splat (i32 -1) +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP18]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i1> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[TMP24]] to i16 +; CHECK-NEXT: [[TMP27:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = and <16 x i32> [[X0]], [[TMP2]] +; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i32> [[TMP1]], [[X1]] +; CHECK-NEXT: [[TMP30:%.*]] = or <16 x i32> [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[TMP31:%.*]] = or <16 x i32> [[TMP30]], [[TMP29]] +; CHECK-NEXT: [[TMP32:%.*]] = and <16 x i32> [[X0]], [[X1]] +; CHECK-NEXT: [[TMP33:%.*]] = xor <16 x i32> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = or <16 x i32> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i32> [[TMP34]], zeroinitializer +; CHECK-NEXT: [[TMP36:%.*]] = xor <16 x i32> [[TMP34]], splat (i32 -1) +; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i32> [[TMP36]], [[TMP33]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq <16 x i32> [[TMP37]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> 
[[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq <16 x i32> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP1]] to i16 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i1> [[TMP39]] to i16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP25]], [[TMP40]] +; CHECK-NEXT: [[RES2:%.*]] = add i16 [[TMP26]], [[TMP41]] +; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[RES2]] +; + %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) + %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + +declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2) + +define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_ptestnm_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[X0:%.*]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP1]], [[X1:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[X0]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], splat (i64 -1) +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i1> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP18]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i1> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i1> [[TMP23]] to i8 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i1> [[TMP24]] to i8 +; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = and <8 x i64> [[X0]], [[TMP2]] +; CHECK-NEXT: [[TMP29:%.*]] = and <8 x i64> [[TMP1]], [[X1]] +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[TMP29]] +; CHECK-NEXT: [[TMP32:%.*]] = and <8 x i64> [[X0]], [[X1]] +; CHECK-NEXT: [[TMP33:%.*]] = xor <8 x i64> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i64> [[TMP34]], zeroinitializer +; CHECK-NEXT: [[TMP36:%.*]] = xor <8 x 
i64> [[TMP34]], splat (i64 -1) +; CHECK-NEXT: [[TMP37:%.*]] = and <8 x i64> [[TMP36]], [[TMP33]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq <8 x i64> [[TMP37]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <8 x i1> [[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq <8 x i64> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP1]] to i8 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i1> [[TMP39]] to i8 +; CHECK-NEXT: [[_MSPROP:%.*]] = or i8 [[TMP25]], [[TMP40]] +; CHECK-NEXT: [[RES2:%.*]] = add i8 [[TMP26]], [[TMP41]] +; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[RES2]] +; + %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone +define i16 @test_kand(i16 %a0, i16 %a1) #0 { +; +; CHECK-LABEL: @test_kand( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i1> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i1> [[TMP3]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i1> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP11]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP12]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i1> [[TMP13]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP13]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i1> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP21]] to i16 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP22]] to i16 +; CHECK-NEXT: store i16 [[TMP23]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP24]] +; + %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.kandn.w(i16, i16) nounwind readnone +define i16 @test_kandn(i16 %a0, i16 %a1) #0 { +; +; CHECK-LABEL: @test_kandn( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to 
<16 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i1> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[_MSPROP]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i1> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP5]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i1> [[TMP11]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP12]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[TMP13]] to <16 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i1> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i1> [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[_MSPROP1]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP18]], [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i1> [[_MSPROP1]], [[TMP17]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i1> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[TMP24]] to i16 +; CHECK-NEXT: store i16 [[TMP25]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP26]] +; + %t1 = call i16 @llvm.x86.avx512.kandn.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kandn.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone +define i16 @test_knot(i16 %a0) #0 { +; +; CHECK-LABEL: @test_knot( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[_MSPROP]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: store i16 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP6]] +; + %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.kor.w(i16, i16) nounwind readnone +define i16 @test_kor(i16 %a0, i16 %a1) #0 { +; +; CHECK-LABEL: @test_kor( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i1> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP3]], bitcast (<1 x 
i16> splat (i16 8) to <16 x i1>), i32 0), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 1), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 2), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 3), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 4), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 5), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 6), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 7), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 8), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 9), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 10), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 11), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 12), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 13), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 14), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 15), i1 true)> +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i1> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP4]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i1> [[TMP11]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP12]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[TMP13]] to <16 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i1> [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i1> [[TMP17]], splat (i1 true) +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i1> [[TMP18]], [[TMP16]] +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP14]], [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i1> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i1> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i1> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[TMP24]] to i16 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i1> [[TMP25]] to i16 +; CHECK-NEXT: store i16 [[TMP26]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP27]] +; + %t1 = call i16 @llvm.x86.avx512.kor.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kor.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone +; TODO: the two kxnor instructions here are a no-op and should be eliminated, +; probably by FoldConstantArithmetic in SelectionDAG.
+define i16 @test_kxnor(i16 %a0, i16 %a1) #0 { +; +; CHECK-LABEL: @test_kxnor( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i1> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i1> [[TMP5]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[_MSPROP1]] to i16 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP6]] to i16 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP7]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP8]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i1> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[TMP10]], splat (i1 true) +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i1> [[_MSPROP2]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i1> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[_MSPROP3]] to i16 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i1> [[TMP14]] to i16 +; CHECK-NEXT: store i16 [[TMP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP16]] +; + %t1 = call i16 @llvm.x86.avx512.kxnor.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kxnor.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.kxor.w(i16, i16) nounwind readnone +define i16 @test_kxor(i16 %a0, i16 %a1) #0 { +; +; CHECK-LABEL: @test_kxor( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[_MSPROP]] to i16 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP6]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP7]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i1> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i1> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i1> [[_MSPROP1]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP12]] to i16 +; CHECK-NEXT: store i16 [[TMP13]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP14]] +; + %t1 = call i16 @llvm.x86.avx512.kxor.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kxor.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone +define i32 @test_kortestz(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 
x i64> %D) #0 { +; CHECK-LABEL: @test_kortestz( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[A:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[B:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], splat (i32 -1) +; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i32> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP10]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[C:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i64> [[D:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i32> [[TMP20]], splat (i32 -1) +; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i32> [[TMP22]], [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <16 x i32> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> [[TMP21]], [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i1> [[TMP14]] to i16 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP1]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i1> [[TMP25]] to i16 +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i16 [[TMP26]] to <16 x i1> +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i16 [[TMP27]] to <16 x i1> +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[TMP28]] to <16 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[TMP29]] to <16 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = xor <16 x i1> [[TMP31]], splat (i1 true) +; CHECK-NEXT: [[TMP35:%.*]] = xor <16 x i1> [[TMP33]], splat (i1 true) +; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[TMP30]], [[TMP32]] +; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i1> [[TMP34]], [[TMP32]] +; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP30]], [[TMP35]] +; CHECK-NEXT: [[TMP39:%.*]] = or <16 x i1> [[TMP36]], [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = or <16 x i1> [[TMP39]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i1> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i1> [[TMP40]] to i16 +; CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i1> [[TMP41]] to i16 +; CHECK-NEXT: 
[[TMP44:%.*]] = xor i16 [[TMP43]], 0 +; CHECK-NEXT: [[TMP45:%.*]] = or i16 [[TMP42]], 0 +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne i16 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = xor i16 [[TMP45]], -1 +; CHECK-NEXT: [[TMP48:%.*]] = and i16 [[TMP47]], [[TMP44]] +; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP48]], 0 +; CHECK-NEXT: [[_MSPROP_ICMP2:%.*]] = and i1 [[TMP46]], [[TMP49]] +; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP43]], 0 +; CHECK-NEXT: [[_MSPROP:%.*]] = zext i1 [[_MSPROP_ICMP2]] to i32 +; CHECK-NEXT: [[TMP51:%.*]] = zext i1 [[TMP50]] to i32 +; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP51]] +; +entry: + %0 = bitcast <8 x i64> %A to <16 x i32> + %1 = bitcast <8 x i64> %B to <16 x i32> + %2 = icmp ne <16 x i32> %0, %1 + %3 = bitcast <8 x i64> %C to <16 x i32> + %4 = bitcast <8 x i64> %D to <16 x i32> + %5 = icmp ne <16 x i32> %3, %4 + %6 = bitcast <16 x i1> %2 to i16 + %7 = bitcast <16 x i1> %5 to i16 + %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone +define i32 @test_kortestc(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D) #0 { +; CHECK-LABEL: @test_kortestc( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[A:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[B:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], splat (i32 -1) +; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i32> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP10]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[C:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i64> [[D:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i32> [[TMP20]], splat (i32 -1) +; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i32> [[TMP22]], [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <16 x i32> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> [[TMP21]], [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast 
<16 x i1> [[_MSPROP_ICMP]] to i16 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i1> [[TMP14]] to i16 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP1]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i1> [[TMP25]] to i16 +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i16 [[TMP26]] to <16 x i1> +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i16 [[TMP27]] to <16 x i1> +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[TMP28]] to <16 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[TMP29]] to <16 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = xor <16 x i1> [[TMP31]], splat (i1 true) +; CHECK-NEXT: [[TMP35:%.*]] = xor <16 x i1> [[TMP33]], splat (i1 true) +; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[TMP30]], [[TMP32]] +; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i1> [[TMP34]], [[TMP32]] +; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP30]], [[TMP35]] +; CHECK-NEXT: [[TMP39:%.*]] = or <16 x i1> [[TMP36]], [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = or <16 x i1> [[TMP39]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i1> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i1> [[TMP40]] to i16 +; CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i1> [[TMP41]] to i16 +; CHECK-NEXT: [[TMP44:%.*]] = xor i16 [[TMP43]], 0 +; CHECK-NEXT: [[TMP45:%.*]] = or i16 [[TMP42]], 0 +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne i16 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = xor i16 [[TMP45]], -1 +; CHECK-NEXT: [[TMP48:%.*]] = and i16 [[TMP47]], [[TMP44]] +; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP48]], 0 +; CHECK-NEXT: [[_MSPROP_ICMP2:%.*]] = and i1 [[TMP46]], [[TMP49]] +; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP43]], 0 +; CHECK-NEXT: [[_MSPROP:%.*]] = zext i1 [[_MSPROP_ICMP2]] to i32 +; CHECK-NEXT: [[TMP51:%.*]] = zext i1 [[TMP50]] to i32 +; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP51]] +; +entry: + %0 = bitcast <8 x i64> %A to <16 x i32> + %1 = bitcast <8 x i64> %B to <16 x i32> + %2 = icmp ne <16 x i32> %0, %1 + %3 = bitcast <8 x i64> %C to <16 x i32> + %4 = bitcast <8 x i64> %D to <16 x i32> + %5 = icmp ne <16 x i32> %3, %4 + %6 = bitcast <16 x i1> %2 to i16 + %7 = bitcast <16 x i1> %5 to i16 + %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7) + ret i32 %res +} + +define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) #0 { +; CHECK-LABEL: @test_cmpps( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 2, <16 x i1> splat (i1 true), i32 8) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[RES]] to i16 +; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP7]] +; + %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8) + ret i16 
%res +} +declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32) + +define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) #0 { +; CHECK-LABEL: @test_cmppd( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 4, <8 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i1> [[RES]] to i8 +; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[TMP7]] +; + %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4) + ret i8 %res +} +declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) + +define <8 x i64> @test_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_mul_epi32_rr( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i64> [[TMP3]], splat (i64 32) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32) +; CHECK-NEXT: [[TMP10:%.*]] = ashr <8 x i64> [[TMP8]], splat (i64 32) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rrk( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr 
@__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) +; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = ashr <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rrkz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 
x i64> +; CHECK-NEXT: [[TMP8:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) +; CHECK-NEXT: [[TMP11:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[_MSPROP]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP25]], <8 x i64> [[TMP22]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = ashr <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: 
[[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = ashr <8 x i64> [[TMP19]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP24]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rmk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; 
CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %b = load <16 x i32>, ptr %ptr_b + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rmkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> 
[[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[TMP28]], [[_MSPROP]] +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP30]], <8 x i64> [[TMP27]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %b = load <16 x i32>, ptr %ptr_b + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rmb( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; 
CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP25]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rmbk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = shl <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = ashr <8 x i64> [[TMP23]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = mul <8 
x i64> [[TMP20]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rmbkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: 
[[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>) + +define <8 x i64> @test_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_mul_epu32_rr( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP3]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x 
i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rrk( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rrkz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[_MSPROP]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP25]], <8 x i64> [[TMP22]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to 
<8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP24]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rmk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] 
= and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %b = load <16 x i32>, ptr %ptr_b + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rmkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] +; CHECK-NEXT: 
[[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[TMP28]], [[_MSPROP]] +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP30]], <8 x i64> [[TMP27]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %b = load <16 x i32>, ptr %ptr_b + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rmb( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP19:%.*]] 
= and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP25]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rmbk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], 
zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[TMP23]] +; CHECK-NEXT: [[TMP26:%.*]] = and <8 x i64> [[TMP14]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rmbkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast 
<16 x i32> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>) + +define <2 x double> @test_x86_avx512_mm_cvtu32_sd(<2 x double> %a, i32 %b) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mm_cvtu32_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[B:%.*]] to double +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A:%.*]], double [[TMP4]], i64 0 +; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[TMP5]] +; + %res = call <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double> %a, i32 %b) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double>, i32) nounwind readnone + +define <16 x float> @test_x86_vbroadcast_ss_512(ptr %a0) #0 { +; +; CHECK-LABEL: 
@test_x86_vbroadcast_ss_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] +; CHECK: 2: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[A0:%.*]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A0]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x float> poison, float [[TMP4]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <16 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[TMP4]], i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <16 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[TMP4]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <16 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[TMP4]], i32 3 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <16 x i32> [[_MSPROP3]], i32 [[_MSLD]], i32 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x float> [[TMP11]], float [[TMP4]], i32 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <16 x i32> [[_MSPROP4]], i32 [[_MSLD]], i32 5 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x float> [[TMP12]], float [[TMP4]], i32 5 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <16 x i32> [[_MSPROP5]], i32 [[_MSLD]], i32 6 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x float> [[TMP13]], float [[TMP4]], i32 6 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <16 x i32> [[_MSPROP6]], i32 [[_MSLD]], i32 7 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x float> [[TMP14]], float [[TMP4]], i32 7 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <16 x i32> [[_MSPROP7]], i32 [[_MSLD]], i32 8 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> [[TMP15]], float [[TMP4]], i32 8 +; CHECK-NEXT: [[_MSPROP9:%.*]] = insertelement <16 x i32> [[_MSPROP8]], i32 [[_MSLD]], i32 9 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP4]], i32 9 +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <16 x i32> [[_MSPROP9]], i32 [[_MSLD]], i32 10 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP4]], i32 10 +; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <16 x i32> [[_MSPROP10]], i32 [[_MSLD]], i32 11 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP4]], i32 11 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <16 x i32> [[_MSPROP11]], i32 [[_MSLD]], i32 12 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 12 +; CHECK-NEXT: [[_MSPROP13:%.*]] = insertelement <16 x i32> [[_MSPROP12]], i32 [[_MSLD]], i32 13 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP4]], i32 13 +; CHECK-NEXT: [[_MSPROP14:%.*]] = insertelement <16 x i32> [[_MSPROP13]], i32 [[_MSLD]], i32 14 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP4]], i32 14 +; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <16 x 
i32> [[_MSPROP14]], i32 [[_MSLD]], i32 15 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP4]], i32 15 +; CHECK-NEXT: store <16 x i32> [[_MSPROP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP23]] +; + %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(ptr %a0) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(ptr) nounwind readonly + +define <8 x double> @test_x86_vbroadcast_sd_512(ptr %a0) #0 { +; +; CHECK-LABEL: @test_x86_vbroadcast_sd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] +; CHECK: 2: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[A0:%.*]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A0]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP4]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP4]], i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i64> [[_MSPROP1]], i64 [[_MSLD]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP4]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i64> [[_MSPROP2]], i64 [[_MSLD]], i32 3 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP4]], i32 3 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i64> [[_MSPROP3]], i64 [[_MSLD]], i32 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i64> [[_MSPROP4]], i64 [[_MSLD]], i32 5 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP4]], i32 5 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> [[_MSPROP5]], i64 [[_MSLD]], i32 6 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP4]], i32 6 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i64> [[_MSPROP6]], i64 [[_MSLD]], i32 7 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP4]], i32 7 +; CHECK-NEXT: store <8 x i64> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP15]] +; + %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(ptr %a0) ; <<8 x double>> [#uses=1] + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(ptr) nounwind readonly + +declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; 
CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP7]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP18]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_df_512( +; 
CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: 
[[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 
[[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x 
i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) + ret <16 x float> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) + +define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pternlog_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call 
void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP9]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X0]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X0]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %res = call <16 x i32> 
@llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) + +define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) + +define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pternlog_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP9]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[X0]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[X0]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) + 
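+; Reader's note (editorial aside, not autogenerated output): the masked pternlog,
+; permvar, and vpermi2var checks in this file all instantiate one MemorySanitizer
+; shadow recipe for a masked result `%r = select %m, %v, %dst`. In pseudo-IR,
+; with illustrative names (%sv, %sdst, %sm are the shadows of %v, %dst, %m):
+;   %s  = select %m, %sv, %sdst   ; propagate the chosen operand's shadow per lane
+;   %d0 = xor %v, %dst            ; bits where the two candidate values differ...
+;   %d1 = or %d0, %sv
+;   %d2 = or %d1, %sdst           ; ...plus both candidates' shadows
+;   %sr = select %sm, %d2, %s     ; an uninitialized mask bit poisons its lane
+; That is, a lane's result shadow is clean only if its mask bit is initialized and
+; the selected value is clean; the [[_MSPROP_SELECT]] sequences above and below
+; are instances of this shape, with constant operands folded to zeroinitializer.
+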
+define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; + %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: 
unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X4:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %x2 = load <16 x i32>, ptr %x2p + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X1]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %x2 = load <16 x i32>, ptr %x2p + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, 
<8 x i64>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double> +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 
[[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP20]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float> +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP9]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; 
CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP20]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) + ret <16 x float> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load 
<8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> 
[[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %x2 = load <16 x i32>, ptr %x2p + %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2S:%.*]] = load double, ptr [[X2PTR:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2PTR]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> undef, double [[X2S]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]]) +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> 
[[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP23]] +; + %x2s = load double, ptr %x2ptr + %x2ins = insertelement <8 x double> undef, double %x2s, i32 0 + %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer + %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], 
ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) + ret <16 x float> %res +} + + +declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> 
%x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) + +define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 { +; 
CHECK-LABEL: @test_vsubps_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 9) + ret <16 x float> %res +} + +define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 10) + ret <16 x float> %res +} + +define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> 
[[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 11) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 9) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 
[[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 10) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 11) + ret <16 x float> %res +} + +;; mask float +define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: 
[[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 %mask, i32 9) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] 
to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 %mask, i32 10) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> 
[[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 %mask, i32 11) + ret <16 x float> %res +} + +;; With Passthru value +define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_passthru_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> %passthru, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_passthru_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> %passthru, i16 %mask, i32 9) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_passthru_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; 
CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> %passthru, i16 %mask, i32 10) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_passthru_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> %passthru, i16 %mask, i32 11) + ret <16 x float> %res +} + +;; mask double +define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_vmulpd_mask_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to 
ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double> zeroinitializer, i8 %mask, i32 8) + ret <8 x double> %res +} + +define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_vmulpd_mask_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> 
[[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double> zeroinitializer, i8 %mask, i32 9) + ret <8 x double> %res +} + +define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_vmulpd_mask_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double> zeroinitializer, i8 %mask, i32 10) + ret <8 x double> %res +} + +define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_vmulpd_mask_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: 
[[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP16]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
+ <8 x double> zeroinitializer, i8 %mask, i32 11)
+ ret <8 x double> %res
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 11)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 11)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 11)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 11)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 11)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 11)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 11)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x
float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} +define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 9) + ret <16 x float> %res +} +define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; 
CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 10) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 11) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> 
@llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define void @test_mask_compress_store_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_store_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP3]], ptr [[TMP8]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.masked.compressstore.v8f64(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 %mask) + +define void @test_compress_store_pd_512(ptr %addr, <8 x double> %data) #0 { +; +; CHECK-LABEL: @test_compress_store_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP2]], ptr [[TMP5]], <8 x i1> splat (i1 true)) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.masked.compressstore.v8f64(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true)) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 -1) + ret void +} + +define void @test_mask_compress_store_ps_512(ptr %addr, <16 x float> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_store_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call 
void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP3]], ptr [[TMP8]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.masked.compressstore.v16f32(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 %mask) + +define void @test_compress_store_ps_512(ptr %addr, <16 x float> %data) #0 { +; +; CHECK-LABEL: @test_compress_store_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP2]], ptr [[TMP5]], <16 x i1> splat (i1 true)) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.masked.compressstore.v16f32(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true)) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 -1) + ret void +} + +define void @test_mask_compress_store_q_512(ptr %addr, <8 x i64> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_store_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP3]], ptr [[TMP8]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: 
[[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 %mask) + +define void @test_compress_store_q_512(ptr %addr, <8 x i64> %data) #0 { +; +; CHECK-LABEL: @test_compress_store_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP2]], ptr [[TMP5]], <8 x i1> splat (i1 true)) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true)) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 -1) + ret void +} + +define void @test_mask_compress_store_d_512(ptr %addr, <16 x i32> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_store_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP3]], ptr [[TMP8]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 %mask) + ret void +} + +declare void 
@llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 %mask) + +define void @test_compress_store_d_512(ptr %addr, <16 x i32> %data) #0 { +; +; CHECK-LABEL: @test_compress_store_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP2]], ptr [[TMP5]], <16 x i1> splat (i1 true)) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true)) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 -1) + ret void +} + +define <8 x double> @test_mask_expand_load_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_load_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP8]], <8 x i1> [[TMP5]], <8 x i64> [[TMP3]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x double> [[DATA:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP12]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 %mask) + ret <8 x double> %res +} + +define <8 x double> @test_maskz_expand_load_pd_512(ptr %addr, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_load_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x 
i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP7]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x double> zeroinitializer) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 %mask) + +define <8 x double> @test_expand_load_pd_512(ptr %addr, <8 x double> %data) #0 { +; +; CHECK-LABEL: @test_expand_load_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> splat (i1 true), <8 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x double> [[DATA:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP8]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 -1) + ret <8 x double> %res +} + +; Make sure we don't crash if you pass 0 to the mask. 
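+; (As the CHECK lines below show, a constant zero mask means the shadow
+; expandload simply returns its passthru shadow, and only the pointer
+; shadow is checked before the real llvm.masked.expandload.v8f64 call.)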
+define <8 x double> @test_zero_mask_expand_load_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 { +; CHECK-LABEL: @test_zero_mask_expand_load_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> zeroinitializer, <8 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> zeroinitializer, <8 x double> [[DATA:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP8]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 0) + ret <8 x double> %res +} + +define <16 x float> @test_mask_expand_load_ps_512(ptr %addr, <16 x float> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_load_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP8]], <16 x i1> [[TMP5]], <16 x i32> [[TMP3]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x float> [[DATA:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP12]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> %data, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @test_maskz_expand_load_ps_512(ptr %addr, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_load_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = 
load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP7]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x float> zeroinitializer) +; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> zeroinitializer, i16 %mask) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> %data, i16 %mask) + +define <16 x float> @test_expand_load_ps_512(ptr %addr, <16 x float> %data) #0 { +; +; CHECK-LABEL: @test_expand_load_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP5]], <16 x i1> splat (i1 true), <16 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x float> [[DATA:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP8]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> %data, i16 -1) + ret <16 x float> %res +} + +define <8 x i64> @test_mask_expand_load_q_512(ptr %addr, <8 x i64> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_load_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 
[[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP8]], <8 x i1> [[TMP5]], <8 x i64> [[TMP3]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x i64> [[DATA:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %data, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_maskz_expand_load_q_512(ptr %addr, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_load_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP7]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %data, i8 %mask) + +define <8 x i64> @test_expand_load_q_512(ptr %addr, <8 x i64> %data) #0 { +; +; CHECK-LABEL: @test_expand_load_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call 
<8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> splat (i1 true), <8 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x i64> [[DATA:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP8]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %data, i8 -1) + ret <8 x i64> %res +} + +define <16 x i32> @test_mask_expand_load_d_512(ptr %addr, <16 x i32> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_load_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP8]], <16 x i1> [[TMP5]], <16 x i32> [[TMP3]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x i32> [[DATA:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> %data, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_maskz_expand_load_d_512(ptr %addr, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_load_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP7]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; 
CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer) +; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> %data, i16 %mask) + +define <16 x i32> @test_expand_load_d_512(ptr %addr, <16 x i32> %data) #0 { +; +; CHECK-LABEL: @test_expand_load_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP5]], <16 x i1> splat (i1 true), <16 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x i32> [[DATA:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP8]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> %data, i16 -1) + ret <16 x i32> %res +} + +define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_maskz_min_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; 
CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_maskz_min_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_mask_min_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_mask_min_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; 
CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_min_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_min_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x 
float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_maskz_max_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_maskz_max_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call 
void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_mask_max_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret 
<16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_mask_max_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_max_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; 
CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_max_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_sqrt_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) +; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> undef, i8 -1, i32 4) + ret <8 x double> %res +} +define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sqrt_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], 
[[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> %passthru, i8 %mask, i32 4) + ret <8 x double> %res +} +define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_sqrt_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 4) + ret <8 x double> %res +} +define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_sqrt_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP5]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> undef, i8 -1, i32 11) + ret <8 x double> %res +} +define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sqrt_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> zeroinitializer, <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP9]], <8 x double> [[TMP7]], <8 x double> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> %passthru, i8 %mask, i32 11) + ret <8 x double> %res +} +define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_sqrt_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP14]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 11) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) #0 { +; 
CHECK-LABEL: @test_sqrt_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) +; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %res +} +define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sqrt_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP13]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> %passthru, i16 %mask, i32 4) + ret <16 x float> %res +} +define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_sqrt_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %res = call 
<16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} +define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_sqrt_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP5]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 11) + ret <16 x float> %res +} +define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sqrt_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> %passthru, i16 %mask, i32 11) + ret <16 x float> %res +} +define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_sqrt_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP14]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 11) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prolv_d_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_d_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_d_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { 
+; CHECK-LABEL: @test_int_x86_avx512_prolv_q_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_q_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_q_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to 
ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prorv_d_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_d_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] 
= call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_d_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prorv_q_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_q_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_q_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x 
i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i32, <16 x i32>, i16) + +define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prol_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3)) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4)) +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4)) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or 
<16 x i32> [[TMP20]], [[TMP15]]
+; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]]
+; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5))
+; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5))
+; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0
+; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1
+; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1
+; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2
+; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2
+; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]]
+;
+ %res0 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1)
+ %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+ %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1
+ %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2
+ ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i32, <8 x i64>, i8)
+
+define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_prol_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3))
+; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3))
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4))
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4))
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]]
+; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]]
+; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5))
+; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5))
+; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0
+; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1
+; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1
+; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2
+; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2
+; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]]
+;
+ %res0 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1)
+ %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0
+ %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1
+ %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2
+ ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32>, i32, <16 x i32>, i16)
+
+define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_pror_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pror_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3))
+; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3))
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4))
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4))
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]]
+; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]]
+; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5))
+; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5))
+; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0
+; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1
+; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1
+; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2
+; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2
+; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]]
+;
+ %res0 = call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1)
+ %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+ %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1
+ %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2
+ ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64>, i32, <8 x i64>, i8)
+
+define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_pror_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pror_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3))
+; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3))
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4))
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4))
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]]
+; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]]
+; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5))
+; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5))
+; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0
+; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1
+; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1
+; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2
+; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2
+; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]]
+;
+ %res0 = call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1)
+ %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0
+ %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1
+ %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2
+ ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP5]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP5]]
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X0]], double [[TMP18]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
+; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
+; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK: 23:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 24:
+; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11)
+; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
+; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10)
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP11]]
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP27]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP27]]
+; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT15]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X0]], double [[TMP42]], i64 0
+; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]]
+; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES4]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res2, %res3
+ ret <2 x double> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP5]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP5]]
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
+; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
+; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK: 23:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 24:
+; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11)
+; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
+; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10)
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP11]]
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP27]] to i32
+; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP27]]
+; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT15]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X0]], float [[TMP42]], i64 0
+; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]]
+; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES4]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 11)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP14]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP16]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP11]], double [[TMP8]], double 0.000000e+00
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[X0]], double [[TMP17]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP14]]
+; CHECK-NEXT: [[_MSCMP15:%.*]] = icmp ne i64 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR16:%.*]] = or i1 [[_MSOR]], [[_MSCMP15]]
+; CHECK-NEXT: br i1 [[_MSOR16]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]]
+; CHECK: 22:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 23:
+; CHECK-NEXT: [[TMP24:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP19]], double [[TMP20]], double [[TMP21]], i32 11)
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP25]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i1> [[TMP26]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 0, i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast double [[TMP24]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = xor i64 [[TMP29]], 0
+; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[TMP30]], 0
+; CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP31]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT11:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP32]], i64 [[TMP28]]
+; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP27]], double [[TMP24]], double 0.000000e+00
+; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT11]], i64 0
+; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x double> [[X0]], double [[TMP33]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP12]]
+; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[TMP18]], [[TMP34]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP13]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES2]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 11)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = xor i32 [[TMP13]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = or i32 [[TMP14]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP16]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP11]], float [[TMP8]], float 0.000000e+00
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[X0]], float [[TMP17]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i32 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP14]]
+; CHECK-NEXT: [[_MSCMP15:%.*]] = icmp ne i32 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR16:%.*]] = or i1 [[_MSOR]], [[_MSCMP15]]
+; CHECK-NEXT: br i1 [[_MSOR16]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]]
+; CHECK: 22:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 23:
+; CHECK-NEXT: [[TMP24:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]], i32 11)
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP25]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i1> [[TMP26]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 0
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast float [[TMP24]] to i32
+; CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP29]], 0
+; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 0
+; CHECK-NEXT: [[TMP32:%.*]] = or i32 [[TMP31]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT11:%.*]] = select i1 [[_MSPROP10]], i32 [[TMP32]], i32 [[TMP28]]
+; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP27]], float [[TMP24]], float 0.000000e+00
+; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT11]], i64 0
+; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[X0]], float [[TMP33]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP12]]
+; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[TMP18]], [[TMP34]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP18]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 11)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res
+}
+declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP7]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP2]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP7]]
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X2]], double [[TMP18]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
+; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
+; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK: 23:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 24:
+; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11)
+; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X2]], double [[TMP25]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
+; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10)
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP13]]
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP29]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP13]]
+; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP29]]
+; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT15]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X2]], double [[TMP42]], i64 0
+; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]]
+; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES4]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res2, %res3
+ ret <2 x double> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP7]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP2]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP7]]
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X2]], float [[TMP18]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
+; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
+; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK: 23:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 24:
+; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11)
+; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP3]], i32 0, i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X2]], float [[TMP25]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
+; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10)
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP13]]
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP29]] to i32
+; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP13]]
+; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP29]]
+; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT15]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X2]], float [[TMP42]], i64 0
+; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]]
+; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES4]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 11)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) #0 {
+;
+; CHECK-LABEL: @fmadd_ss_mask_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0
+; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1
+; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2
+; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3
+; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]]
+; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 [[_MSPROP9]]
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP14]] to i32
+; CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], [[_MSPROP13]]
+; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP25]], [[_MSPROP9]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP26]], i32 [[TMP21]]
+; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], float [[TMP17]], float [[TMP14]]
+; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[AV]], float [[TMP27]], i64 0
+; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0
+; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP28]], i32 0
+; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
+; CHECK: 29:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 30:
+; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080
+; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP33]], align 4
+; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4
+; CHECK-NEXT: ret void
+;
+ %a.val = load float, ptr %a
+ %av0 = insertelement <4 x float> undef, float %a.val, i32 0
+ %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
+ %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
+ %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
+
+ %b.val = load float, ptr %b
+ %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+ %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+ %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+ %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+
+ %vr = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4)
+
+ %sr = extractelement <4 x float> %vr, i32 0
+ store float %sr, ptr %a
+ ret void
+}
+
+define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 {
+;
+; CHECK-LABEL: @fmadd_ss_maskz_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0
+; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1
+; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2
+; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3
+; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]]
+; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = xor i32 [[TMP22]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP13]]
+; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP25]], i32 [[TMP21]]
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], float [[TMP17]], float 0.000000e+00
+; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[AV]], float [[TMP26]], i64 0
+; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0
+; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP27]], i32 0
+; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
+; CHECK: 28:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 29:
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080
+; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP32]], align 4
+; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4
+; CHECK-NEXT: ret void
+;
+ %a.val = load float, ptr %a
+ %av0 = insertelement <4 x float> undef, float %a.val, i32 0
+ %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
+ %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
+ %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
+
+ %b.val = load float, ptr %b
+ %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+ %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+ %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+ %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+
+ %vr = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4)
+
+ %sr = extractelement <4 x float> %vr, i32 0
+ store float %sr, ptr %a
+ ret void
+}
+
+define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) #0 {
+;
+; CHECK-LABEL: @fmadd_sd_mask_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0
+; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1
+; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]]
+; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]]
+; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 [[_MSPROP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast double [[TMP14]] to i64
+; CHECK-NEXT: [[TMP24:%.*]] = xor i64 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], [[_MSPROP9]]
+; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP25]], [[_MSPROP5]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP26]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], double [[TMP17]], double [[TMP14]]
+; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> [[AV]], double [[TMP27]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0
+; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP28]], i32 0
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
+; CHECK: 29:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 30:
+; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080
+; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP33]], align 8
+; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8
+; CHECK-NEXT: ret void
+;
+ %a.val = load double, ptr %a
+ %av0 = insertelement <2 x double> undef, double %a.val, i32 0
+ %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
+
+ %b.val = load double, ptr %b
+ %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
+ %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
+ + %vr = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4) + + %sr = extractelement <2 x double> %vr, i32 0 + store double %sr, ptr %a + ret void +} + +define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 { +; +; CHECK-LABEL: @fmadd_sd_maskz_memfold( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1 +; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0 +; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1 +; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]] +; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]] +; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 +; 
CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = xor i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP9]] +; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP25]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], double [[TMP17]], double 0.000000e+00 +; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[AV]], double [[TMP26]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0 +; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP27]], i32 0 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080 +; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP32]], align 8 +; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8 +; CHECK-NEXT: ret void +; + %a.val = load double, ptr %a + %av0 = insertelement <2 x double> undef, double %a.val, i32 0 + %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 + + %b.val = load double, ptr %b + %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 + %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 + + %vr = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4) + + %sr = extractelement <2 x double> %vr, i32 0 + store double %sr, ptr %a + ret void +} + +declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP6]], double [[TMP7]], double 
[[TMP8]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[_MSPROP4]], i64 [[_MSPROP5]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast double [[TMP9]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP17]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP19]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], double [[TMP9]], double [[TMP10]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> [[X2]], double [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP22]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] +; CHECK: 26: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 27: +; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP23]], double [[TMP24]], double [[TMP25]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> [[X2]], double [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP31]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], 
label [[TMP36:%.*]], !prof [[PROF1]] +; CHECK: 35: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 36: +; CHECK-NEXT: [[TMP37:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP32]], double [[TMP33]], double [[TMP34]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i64 0, i64 [[_MSPROP16]] +; CHECK-NEXT: [[TMP43:%.*]] = bitcast double [[TMP37]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast double [[TMP38]] to i64 +; CHECK-NEXT: [[TMP45:%.*]] = xor i64 [[TMP43]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = or i64 [[TMP46]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP47]], i64 [[TMP42]] +; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], double [[TMP37]], double [[TMP38]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x double> [[X2]], double [[TMP48]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP21]], [[TMP30]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP49]], [[RES3]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11) + %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res2, %res3 + ret <2 x double> %res4 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; 
CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP9:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float [[TMP7]], float [[TMP8]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[_MSPROP4]], i32 [[_MSPROP5]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast float [[TMP9]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = xor i32 [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP17]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP19]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], float [[TMP9]], float [[TMP10]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[X2]], float [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP22]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] +; CHECK: 26: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 27: +; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[X2]], float [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP31]], i64 0 +; CHECK-NEXT: 
[[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] +; CHECK: 35: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 36: +; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP32]], float [[TMP33]], float [[TMP34]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 0, i32 [[_MSPROP16]] +; CHECK-NEXT: [[TMP43:%.*]] = bitcast float [[TMP37]] to i32 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast float [[TMP38]] to i32 +; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP46]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP47]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], float [[TMP37]], float [[TMP38]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[X2]], float [[TMP48]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP21]], [[TMP30]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP49]], [[RES3]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 11) + %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} + +declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: 
[[TMP5:%.*]] = fneg <2 x double> [[X0:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = fneg <2 x double> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[_MSPROP4]], i64 [[_MSPROP5]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast double [[TMP11]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP20]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], double [[TMP10]], double [[TMP11]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[X2]], double [[TMP21]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = fneg <2 x double> [[X0]] +; CHECK-NEXT: [[TMP24:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP23]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[TMP24]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP2]], i64 0, i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[X2]], double [[TMP30]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = fneg 
<2 x double> [[X0]] +; CHECK-NEXT: [[TMP34:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x double> [[TMP34]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] +; CHECK: 38: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 39: +; CHECK-NEXT: [[TMP40:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP35]], double [[TMP36]], double [[TMP37]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0 +; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i64 0, i64 [[_MSPROP16]] +; CHECK-NEXT: [[TMP46:%.*]] = bitcast double [[TMP40]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast double [[TMP41]] to i64 +; CHECK-NEXT: [[TMP48:%.*]] = xor i64 [[TMP46]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = or i64 [[TMP48]], 0 +; CHECK-NEXT: [[TMP50:%.*]] = or i64 [[TMP49]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP50]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], double [[TMP40]], double [[TMP41]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x double> [[X2]], double [[TMP51]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP22]], [[TMP32]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP52]], [[RES3]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11) + %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res2, %res3 + ret <2 x double> %res4 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 { +; +; CHECK-LABEL: 
@test_int_x86_avx512_mask3_vfnmsub_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X0:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = fneg <4 x float> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[_MSPROP4]], i32 [[_MSPROP5]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast float [[TMP11]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP20]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], float [[TMP10]], float [[TMP11]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[X2]], float [[TMP21]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = fneg <4 x float> [[X0]] +; CHECK-NEXT: [[TMP24:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP23]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP24]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[X2]], float [[TMP30]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = fneg <4 x float> [[X0]] +; CHECK-NEXT: [[TMP34:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP34]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] +; CHECK: 38: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 39: +; CHECK-NEXT: [[TMP40:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP35]], float [[TMP36]], float [[TMP37]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0 +; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 0, i32 [[_MSPROP16]] +; CHECK-NEXT: [[TMP46:%.*]] = bitcast float [[TMP40]] to i32 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast float [[TMP41]] to i32 +; CHECK-NEXT: [[TMP48:%.*]] = xor i32 [[TMP46]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], 0 +; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP50]], i32 [[TMP45]] +; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], float [[TMP40]], float [[TMP41]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[X2]], float [[TMP51]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP22]], [[TMP32]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP52]], [[RES3]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> 
%x2, i8 -1, i32 11) + %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} + +define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr%ptr_b ,i8 %x3,i32 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP3]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP12]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP12]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X1]], float [[TMP23]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x 
float> [[TMP24]] +; + %q = load float, ptr %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4) + ret < 4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP1]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP10]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP10]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X0]], float [[TMP23]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 +; 
CHECK-NEXT: ret <4 x float> [[TMP24]] +; + %q = load float, ptr %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0,<4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4) + ret < 4 x float> %res +} + + +define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.fma.f32(float [[TMP9]], float [[TMP10]], float [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = select i1 false, i32 [[_MSPROP5]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP12]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP17]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = select i1 false, float [[TMP12]], float 0.000000e+00 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP19]] +; + %q = load float, ptr %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 0, i32 4) + ret < 4 x float> %res +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 { +; CHECK-LABEL: 
@test_int_x86_avx512_pmov_qd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> +; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP4]], [[X1:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP10]], <8 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP4]], <8 x i32> [[X1]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP11]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_qd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP3]], <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP10]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res +} + +declare <16 x float> 
@llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32) + +define <16 x float> @test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer +; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 8) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32) + +define <16 x float> @test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> +; 
CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer +; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 8) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %passthru, i8 %mask) + ret <8 x double> %res +} + +define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_compress_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), 
align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +define <8 x double> @test_compress_pd_512(<8 x double> %data) #0 { +; CHECK-LABEL: @test_compress_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> undef, i8 -1) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask) + +define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x 
i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> %passthru, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_compress_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP9]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> zeroinitializer, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @test_compress_ps_512(<16 x float> %data) #0 { +; CHECK-LABEL: @test_compress_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> undef, i16 -1) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> %src0, i16 %mask) + +define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], 
[[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_compress_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP9]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_compress_q_512(<8 x i64> %data) #0 { +; CHECK-LABEL: @test_compress_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> undef, i8 -1) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> %src0, i8 %mask) + +define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_compress_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP9]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_compress_d_512(<16 x i32> %data) #0 { +; CHECK-LABEL: @test_compress_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> undef, i16 -1) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask) + +define <8 x double> 
@test_expand_pd_512(<8 x double> %data) #0 { +; CHECK-LABEL: @test_expand_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> undef, i8 -1) + ret <8 x double> %res +} + +define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %passthru, i8 %mask) + ret <8 x double> %res +} + +define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> 
@llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask) + +define <16 x float> @test_expand_ps_512(<16 x float> %data) #0 { +; CHECK-LABEL: @test_expand_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> undef, i16 -1) + ret <16 x float> %res +} + +define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> %passthru, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP9]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> zeroinitializer, i16 %mask) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> %src0, i16 %mask) + +define <8 x i64> @test_expand_q_512(<8 x i64> %data) #0 { +; CHECK-LABEL: @test_expand_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> undef, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 { +; +; 
CHECK-LABEL: @test_maskz_expand_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP9]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> %src0, i8 %mask) + +define <16 x i32> @test_expand_d_512(<16 x i32> %data) #0 { +; CHECK-LABEL: @test_expand_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> undef, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> 
@llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP9]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask) + +define <16 x float> @test_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, ptr %p) #0 { +; +; CHECK-LABEL: @test_cmp_512( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 1, <16 x i1> splat (i1 true), i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne 
i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[C:%.*]], <16 x float> [[D:%.*]], i32 1, <16 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP5]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] +; CHECK: 15: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 16: +; CHECK-NEXT: [[TMP17:%.*]] = load <16 x float>, ptr [[P:%.*]], align 64 +; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], 87960930222080 +; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP20]], align 64 +; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i1> [[TMP9]], [[TMP14]] +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> zeroinitializer, <16 x i32> [[_MSLD]] +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x float> [[TMP17]] to <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i32> zeroinitializer, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP25]], [[_MSLD]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP26]], <16 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP21]], <16 x float> zeroinitializer, <16 x float> [[TMP17]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP27]] +; + entry: + %0 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, i32 8) + %1 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %c, <16 x float> %d, i32 1, i32 4) + %2 = load <16 x float>, ptr %p + %3 = xor <16 x i1> %0, %1 + %4 = select <16 x i1> %3, <16 x float> zeroinitializer, <16 x float> %2 + ret <16 x float> %4 +} + +declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32) + +attributes #0 = { sanitize_memory } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll new file mode 100644 index 0000000000000..052b497831ee1 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll @@ -0,0 +1,13714 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt %s -S -mtriple=x86_64-linux-gnu -mattr=+avx512f -passes=msan 2>&1 | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512-intrinsics.ll + +define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_compress_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; 
CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1) + ret <8 x double> %2 +} + +define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_compress_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1) + ret <8 x double> %2 +} + +define <8 x double> @test_compress_pd_512(<8 x double> %data) #0 { +; CHECK-LABEL: @test_compress_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %1 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <8 x double> %1 +} + +define 
<16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_mask_compress_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1) + ret <16 x float> %2 +} + +define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) #0 { +; CHECK-LABEL: @test_maskz_compress_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP9]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1) + ret <16 x float> %2 +} + +define <16 x float> @test_compress_ps_512(<16 x float> %data) #0 { +; CHECK-LABEL: @test_compress_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: 
call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %1 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <16 x float> %1 +} + +define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_compress_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1) + ret <8 x i64> %2 +} + +define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_compress_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 
8 +; CHECK-NEXT: ret <8 x i64> [[TMP9]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1) + ret <8 x i64> %2 +} + +define <8 x i64> @test_compress_q_512(<8 x i64> %data) #0 { +; CHECK-LABEL: @test_compress_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %1 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <8 x i64> %1 +} + +define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_mask_compress_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1) + ret <16 x i32> %2 +} + +define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 { +; CHECK-LABEL: @test_maskz_compress_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; 
CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP9]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1) + ret <16 x i32> %2 +} + +define <16 x i32> @test_compress_d_512(<16 x i32> %data) #0 { +; CHECK-LABEL: @test_compress_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %1 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <16 x i32> %1 +} + +define <8 x double> @test_expand_pd_512(<8 x double> %data) #0 { +; CHECK-LABEL: @test_expand_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %1 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <8 x double> %1 +} + +define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_expand_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> 
[[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1) + ret <8 x double> %2 +} + +define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_expand_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1) + ret <8 x double> %2 +} + +define <16 x float> @test_expand_ps_512(<16 x float> %data) #0 { +; CHECK-LABEL: @test_expand_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %1 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <16 x float> %1 +} + +define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_mask_expand_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast 
<16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1) + ret <16 x float> %2 +} + +define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0 { +; CHECK-LABEL: @test_maskz_expand_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP9]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1) + ret <16 x float> %2 +} + +define <8 x i64> @test_expand_q_512(<8 x i64> %data) #0 { +; CHECK-LABEL: @test_expand_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %1 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <8 x i64> %1 +} + +define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_expand_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = 
bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1) + ret <8 x i64> %2 +} + +define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_expand_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP9]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1) + ret <8 x i64> %2 +} + +define <16 x i32> @test_expand_d_512(<16 x i32> %data) #0 { +; CHECK-LABEL: @test_expand_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %1 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> splat (i1 true)) + ret <16 x i32> %1 +} + +define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_mask_expand_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1) + ret <16 x i32> %2 +} + +define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) #0 { +; CHECK-LABEL: @test_maskz_expand_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP9]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1) + ret <16 x i32> %2 +} + +define <16 x float> @test_rcp_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_rcp_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: 
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone + +define <8 x double> @test_rcp_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_rcp_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1] + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone + +declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32) + +define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: @test_rndscale_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 11, i32 4) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 11, i32 4) + ret <2 x double>%res +} + +define <2 x double> @test_rndscale_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) #0 { +; CHECK-LABEL: @test_rndscale_sd_mask( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: 
[[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4) + ret <2 x double>%res +} + +define <2 x double> @test_rndscale_sd_mask_load(<2 x double> %a, ptr %bptr, <2 x double> %c, i8 %mask) #0 { +; CHECK-LABEL: @test_rndscale_sd_mask_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[B:%.*]] = load <2 x double>, ptr [[BPTR:%.*]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[BPTR]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: 
[[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %b = load <2 x double>, ptr %bptr + %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4) + ret <2 x double>%res +} + +define <2 x double> @test_rndscale_sd_maskz(<2 x double> %a, <2 x double> %b, i8 %mask) #0 { +; CHECK-LABEL: @test_rndscale_sd_maskz( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 11, i32 4) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> zeroinitializer, i8 %mask, i32 11, i32 4) + ret <2 x double>%res +} + +declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32) + +define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: @test_rndscale_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 11, i32 4) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4) + ret <4 x float>%res +} + +define <4 x float> @test_rndscale_ss_load(<4 x float> %a, ptr %bptr) #0 { +; CHECK-LABEL: @test_rndscale_ss_load( +; 
CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <4 x float>, ptr [[BPTR:%.*]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[BPTR]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B]], <4 x float> undef, i8 -1, i32 11, i32 4) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %b = load <4 x float>, ptr %bptr + %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4) + ret <4 x float>%res +} + +define <4 x float> @test_rndscale_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) #0 { +; CHECK-LABEL: @test_rndscale_ss_mask( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> 
%a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 11, i32 4) + ret <4 x float>%res +} + +define <4 x float> @test_rndscale_ss_maskz(<4 x float> %a, <4 x float> %b, i8 %mask) #0 { +; CHECK-LABEL: @test_rndscale_ss_maskz( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 11, i32 4) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask, i32 11, i32 4) + ret <4 x float>%res +} + +declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32) + +define <8 x double> @test7(<8 x double> %a) #0 { +; CHECK-LABEL: @test7( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> [[A:%.*]], i32 11, <8 x double> [[A]], i8 -1, i32 4) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4) + ret <8 x double>%res +} + +declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32) + +define <16 x float> @test8(<16 x float> %a) #0 { +; CHECK-LABEL: @test8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], 
[[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> [[A:%.*]], i32 11, <16 x float> [[A]], i16 -1, i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4) + ret <16 x float>%res +} + +define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_rsqrt_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone + +define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_sqrt_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) +; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0) + ret <8 x double> %1 +} + +define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_sqrt_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; 
CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru + ret <8 x double> %3 +} + +define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_sqrt_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} +declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) + +define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_sqrt_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP5]] +; + %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11) + ret <8 x double> %1 +} + +define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_sqrt_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; 
CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> zeroinitializer, <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP9]], <8 x double> [[TMP7]], <8 x double> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru + ret <8 x double> %3 +} + +define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_sqrt_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP14]] +; + %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} +declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32) nounwind readnone + +define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_sqrt_ps_512( +; 
CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) +; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0) + ret <16 x float> %1 +} + +define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_mask_sqrt_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP13]] +; + %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) #0 { +; CHECK-LABEL: @test_maskz_sqrt_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0) + 
%2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} +declare <16 x float> @llvm.sqrt.v16f32(<16 x float>) + +define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_sqrt_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP5]] +; + %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11) + ret <16 x float> %1 +} + +define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_mask_sqrt_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) #0 { +; CHECK-LABEL: @test_maskz_sqrt_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP14]] +; + %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} +declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32) nounwind readnone + +define <8 x double> @test_getexp_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_getexp_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1, i32 4) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) + ret <8 x double> %res +} +define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_getexp_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1, i32 12) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %res = call <8 x double> 
@llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 12) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <16 x float> @test_getexp_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_getexp_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_getexp_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_sqrt_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; 
CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES0:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]] +; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]], i8 [[MASK]], i32 9) +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSCMP13]], [[_MSCMP14]] +; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] +; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> zeroinitializer, i8 [[MASK]], i32 10) +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0 +; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]] +; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; CHECK: 21: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 22: +; CHECK-NEXT: [[RES3:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> zeroinitializer, i8 -1, i32 11) +; CHECK-NEXT: [[RES_1:%.*]] = fadd <4 x float> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES_2:%.*]] = fadd <4 x float> [[RES2]], [[RES3]] +; CHECK-NEXT: [[RES:%.*]] = fadd <4 x float> [[RES_1]], [[RES_2]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) + %res1 = call <4 x float> 
@llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9) + %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 10) + %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 11) + + %res.1 = fadd <4 x float> %res0, %res1 + %res.2 = fadd <4 x float> %res2, %res3 + %res = fadd <4 x float> %res.1, %res.2 + ret <4 x float> %res +} + +declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone + +define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_sqrt_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES0:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]] +; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]], i8 [[MASK]], i32 9) +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; 
CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSCMP13]], [[_MSCMP14]] +; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] +; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> zeroinitializer, i8 [[MASK]], i32 10) +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0 +; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]] +; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; CHECK: 21: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 22: +; CHECK-NEXT: [[RES3:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> zeroinitializer, i8 -1, i32 11) +; CHECK-NEXT: [[RES_1:%.*]] = fadd <2 x double> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES_2:%.*]] = fadd <2 x double> [[RES2]], [[RES3]] +; CHECK-NEXT: [[RES:%.*]] = fadd <2 x double> [[RES_1]], [[RES_2]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9) + %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 10) + %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 11) + + %res.1 = fadd <2 x double> %res0, %res1 + %res.2 = fadd <2 x double> %res2, %res3 + %res = fadd <2 x double> %res.1, %res.2 + ret <2 x double> %res +} + +define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvttsd2usi( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[A0:%.*]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[A0]], i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES2]] +; + %res0 = call i32 
@llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ; + %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ; + %res2 = add i32 %res0, %res1 + ret i32 %res2 +} +declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvttsd2si( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[A0:%.*]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[A0]], i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES2]] +; + %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ; + %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ; + %res2 = add i32 %res0, %res1 + ret i32 %res2 +} +declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvttss2si( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A0:%.*]], i32 8) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A0]], i32 4) +; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES2]] +; + %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ; + %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ; + %res2 = add i32 %res0, %res1 + ret i32 %res2 +} +declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvttss2si_load(ptr %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvttss2si_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br 
i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] +; CHECK: 2: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 3: +; CHECK-NEXT: [[A1:%.*]] = load <4 x float>, ptr [[A0:%.*]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A1]], i32 4) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %a1 = load <4 x float>, ptr %a0 + %res = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a1, i32 4) ; + ret i32 %res +} + +define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvttss2usi( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[A0:%.*]], i32 8) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[A0]], i32 4) +; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES2]] +; + %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ; + %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ; + %res2 = add i32 %res0, %res1 + ret i32 %res2 +} +declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvtsd2usi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0:%.*]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 
6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0]], i32 9) +; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES4]] +; + %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4) + %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 11) + %res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 9) + %res3 = add i32 %res, %res1 + %res4 = add i32 %res3, %res2 + ret i32 %res4 +} +declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvtsd2si32( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0:%.*]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0]], i32 9) +; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES4]] +; + %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4) + %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 11) + %res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 9) + %res3 = add i32 %res, %res1 + %res4 = add i32 %res3, %res2 + ret i32 %res4 +} +declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvtss2usi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0:%.*]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i32 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0]], i32 9) +; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES4]] +; + %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4) + %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 11) + %res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 9) + %res3 = add i32 %res, %res1 + %res4 = add i32 %res3, %res2 + ret i32 %res4 +} +declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvtss2si32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0:%.*]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0]], i32 9) +; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] +; CHECK-NEXT: store i32 0, ptr 
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES4]] +; + %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4) + %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 11) + %res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 9) + %res3 = add i32 %res, %res1 + %res4 = add i32 %res3, %res2 + ret i32 %res4 +} +declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone + +define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, ptr %dst) #0 { +; CHECK-LABEL: @test_x86_vcvtps2ph_256( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0:%.*]], i32 2, <16 x i16> zeroinitializer, i16 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 11, <16 x i16> zeroinitializer, i16 [[MASK:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i16> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i16 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES3:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 12, <16 x i16> [[SRC:%.*]], i16 [[MASK]]) +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP8]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] +; CHECK: 15: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 16: +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[DST:%.*]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], 87960930222080 +; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr [[TMP19]], align 32 +; CHECK-NEXT: store <16 x i16> 
[[RES1]], ptr [[DST]], align 32 +; CHECK-NEXT: [[RES:%.*]] = add <16 x i16> [[RES2]], [[RES3]] +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i16> [[RES]] +; + %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask) + %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 12, <16 x i16> %src, i16 %mask) + store <16 x i16> %res1, ptr %dst + %res = add <16 x i16> %res2, %res3 + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly + +define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) #0 { +; CHECK-LABEL: @test_cmpps( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 2, <16 x i1> splat (i1 true), i32 8) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[RES]] to i16 +; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP7]] +; + %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, <16 x i1> splat (i1 true), i32 8) + %1 = bitcast <16 x i1> %res to i16 + ret i16 %1 +} +declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) + +define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) #0 { +; CHECK-LABEL: @test_cmppd( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 4, <8 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i1> [[RES]] to i8 +; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[TMP7]] +; + %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, <8 x i1> splat (i1 true), i32 4) + 
%1 = bitcast <8 x i1> %res to i8 + ret i8 %1 +} +declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32) + + + ; fp min - max +define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) #0 { +; CHECK-LABEL: @test_vmaxpd( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP7]] +; + %1 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) + ret <8 x double> %1 +} +declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32) + +define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) #0 { +; CHECK-LABEL: @test_vminpd( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP7]] +; + %1 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) + ret <8 x double> %1 +} +declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) + +define void @test_mask_store_ss(ptr %ptr, <4 x float> %data, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_store_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP1]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP1]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]] +; 
CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[MASK]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP8]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP9]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080 +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[TMP14]], i32 1, <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 16: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 17: +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[DATA:%.*]], ptr [[PTR]], i32 1, <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: ret void +; + %1 = and i8 %mask, 1 + %2 = bitcast i8 %1 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @llvm.masked.store.v4f32.p0(<4 x float> %data, ptr %ptr, i32 1, <4 x i1> %extract) + ret void +} +declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) #1 + + +declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32) +declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32) +declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32) + +define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = 
bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + ret <16 x float> %1 +} + +define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + ret <16 x float> %1 +} + +define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + ret <16 x float> %1 +} + +define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x 
i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + ret <16 x float> %1 +} + +define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> 
@llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + ret <16 x float> %1 +} + +define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + ret <16 x float> %1 +} + +define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to 
<16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x 
float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> 
%passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_passthru_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_passthru_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: 
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_passthru_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x 
float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_passthru_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_vmulpd_mask_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: 
[[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 8) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_vmulpd_mask_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 
x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 9) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_vmulpd_mask_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 10) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_vmulpd_mask_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label 
[[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 11) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, 
<16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_add_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: 
unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x 
float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_add_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_add_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_add_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] 
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_add_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_add_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), 
align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_add_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; 
CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_add_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_add_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, 
i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_add_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_add_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_add_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> 
[[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 +} +declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32) + +define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), 
i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_sub_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 
x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: 
@test_mm512_mask_sub_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_sub_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x 
float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_sub_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_sub_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_sub_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label 
[[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_sub_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x 
i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_div_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x 
i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> 
[[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_div_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_div_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_div_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x 
float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_div_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_div_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load 
<16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_div_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; 
CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: 
[[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: 
[[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 +} +declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32) + +define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_min_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_min_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; 
CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_min_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; 
CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_min_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { 
+; CHECK-LABEL: @test_mm512_min_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_min_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 +} +declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) + +define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_max_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; 
CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_max_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> 
%a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_max_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_max_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = 
bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_max_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_max_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> 
[[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 +} +declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32) + +declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_add_ss_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 8) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8) + ret <4 x float> %res +} + +define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_add_ss_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 9) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9) + ret <4 x float> %res +} + +define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_add_ss_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 10) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 10) + ret <4 x float> %res +} + +define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_add_ss_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 11)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 11)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_ss_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_add_ss_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) #0 {
+; CHECK-LABEL: @test_add_ss_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_ss_current_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %a1.val = load float, ptr %a1
+ %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+ %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+ %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+ %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, ptr %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_add_ss_current_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %a1.val = load float, ptr %a1
+ %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+ %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+ %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+ %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_rd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 9)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_ru(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 10)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 10)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_rz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 11)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 11)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_add_sd_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
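+; Editorial note: the trailing i32 operand of these intrinsics is the X86
+; _MM_FROUND_* rounding encoding: 8 = rn-sae, 9 = rd-sae, 10 = ru-sae,
+; 11 = rz-sae, 4 = CUR_DIRECTION (use the MXCSR rounding mode).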
+define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_add_sd_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_current_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %a1.val = load double, ptr %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, ptr %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_add_sd_current_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %a1.val = load double, ptr %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_ss_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_ss_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) #0 {
+; CHECK-LABEL: @test_max_ss_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) #0 {
+; CHECK-LABEL: @test_max_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_ss_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %a1.val = load float, ptr %a1
+ %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+ %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+ %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+ %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, ptr %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_ss_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %a1.val = load float, ptr %a1
+ %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+ %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+ %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+ %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_sd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_sd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_max_sd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_max_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_sd_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %a1.val = load double, ptr %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, ptr %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_sd_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %a1.val = load double, ptr %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) #0 {
+; CHECK-LABEL: @test_x86_avx512_cvtsi2ss32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 11)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 11) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone
+
+define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b) #0 {
+; CHECK-LABEL: @test_x86_avx512__mm_cvt_roundu32_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 9)
+; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, ptr %ptr) #0 {
+; CHECK-LABEL: @test_x86_avx512__mm_cvt_roundu32_ss_mem(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSLD]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B]], i32 9)
+; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %b = load i32, ptr %ptr
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b) #0 {
+; CHECK-LABEL: @test_x86_avx512__mm_cvtu32_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, ptr %ptr) #0 {
+; CHECK-LABEL: @test_x86_avx512__mm_cvtu32_ss_mem(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSLD]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+;
CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B]], i32 4) +; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %b = load i32, ptr %ptr + %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone + +declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP9]] +; + %x2 = load <16 x i32>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %1 +} + +define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: 
[[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X1]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %x2 = load <16 x i32>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1 + ret <16 x i32> %3 +} + +declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>) + +define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; + %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) + ret <8 x double> %1 +} + +define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP20]] +; + %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) + %2 = bitcast <8 x i64> %x1 to <8 x double> + %3 = bitcast i8 %x3 to <8 x i1> + %4 = select <8 x i1> %3, <8 x double> %1, <8 x double> %2 + ret <8 x double> %4 +} + +declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>) + +define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; 
CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP9]] +; + %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) + ret <16 x float> %1 +} + +define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP20]] +; + %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) + %2 = bitcast <16 x i32> %x1 to <16 x float> + %3 = bitcast i16 %x3 to <16 x i1> + %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2 + ret <16 x float> 
%4 +} + +declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>) + +define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + ret <8 x i64> %1 +} + +define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1 + ret <8 x i64> %3 +} + +define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %x2 = load <16 x i32>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2S:%.*]] = load double, ptr [[X2PTR:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2PTR]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x 
i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> undef, double [[X2S]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]]) +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP23]] +; + %x2s = load double, ptr %x2ptr + %x2ins = insertelement <8 x double> undef, double %x2s, i32 0 + %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer + %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 
[[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x 
i64> %x0, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) + ret <16 x i32> %1 +} + +define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1 + ret <16 x i32> %3 +} + +declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) +define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_512( +; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x double> [[X2]], i8 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 11) + %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 8) + %res2 = fadd <8 x double> %res, %res1 + ret <8 x double> %res2 +} + +declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to 
ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x float> [[X2:%.*]], i16 [[X3:%.*]], i32 10) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x float> [[X2]], i16 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[RES]], [[RES1]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 10) + %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 8) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qb_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 
[[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[RES4]] +; + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qb_mem_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; 
CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qb_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x 
i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qb_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qb_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qb_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES4]]
+;
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES4]]
+;
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES4]]
+;
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmov_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32>
+; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP2]]
+;
+ %1 = trunc <8 x i64> %x0 to <8 x i32>
+ ret <8 x i32> %1
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP4]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP10]], <8 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP4]], <8 x i32> [[X1]]
+; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP11]]
+;
+ %1 = trunc <8 x i64> %x0 to <8 x i32>
+ %2 = bitcast i8 %x2 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1
+ ret <8 x i32> %3
+}
+
+define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP3]], <8 x i32> zeroinitializer
+; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP10]]
+;
+ %1 = trunc <8 x i64> %x0 to <8 x i32>
+ %2 = bitcast i8 %x2 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
+ ret <8 x i32> %3
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovs_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
+ ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_maskz_pmovs_qd_512(<8 x i64> %x0, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovs_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> zeroinitializer, i8 [[X2:%.*]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ ret <8 x i32> %res
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qd_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovus_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
+ ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_maskz_pmovus_qd_512(<8 x i64> %x0, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovus_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> zeroinitializer, i8 [[X2:%.*]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ ret <8 x i32> %res
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qd_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_db_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmov_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_db_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_db_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovs_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_db_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_db_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovus_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_db_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_dw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i16> [[RES4]]
+;
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i16> %res0, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmov_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_dw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_dw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i16> [[RES4]]
+;
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i16> %res0, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_dw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_dw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = 
call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i16> [[RES4]] +; + %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2) + %res3 = add <16 x i16> %res0, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_dw_mem_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32>, i32) + +define 
<16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer +; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES2]] +; + %cvt = sitofp <16 x i32> %x0 to <16 x float> + %1 = bitcast i16 %x2 to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1 + %3 = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8) + %res2 = fadd <16 x float> %2, %3 + ret <16 x float> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2dq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], 
label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES2]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32) + +define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> [[X0:%.*]], <8 x float> [[X1:%.*]], i8 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> [[X0]], <8 x float> [[X1]], i8 -1, i32 10) +; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x float> 
[[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES2]] +; + %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4) + %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 10) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2udq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 10) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES2]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 10) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32) + +define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2dq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), 
i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 10) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32) + +define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> [[X0:%.*]], <8 x double> [[X1:%.*]], i8 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: 
[[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> [[X0]], <8 x double> [[X1]], i8 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8) + %res2 = fadd <8 x double> %res, %res1 + ret <8 x double> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32) + +define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2udq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 10) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret 
<16 x i32> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_pd2dq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES2]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32>, i32) + +define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x 
float> [[CVT]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer +; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES2]] +; + %cvt = uitofp <16 x i32> %x0 to <16 x float> + %1 = bitcast i16 %x2 to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1 + %3 = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8) + %res2 = fadd <16 x float> %2, %3 + ret <16 x float> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_pd2udq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> 
@llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES2]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32) + +define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_ps2dq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32) + +define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_ps2udq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1) #0 { +; CHECK-LABEL: @test_getexp_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> 
zeroinitializer, i8 -1, i32 8) + ret <4 x float> %res +} + +define <4 x float> @test_mask_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_getexp_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES0:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]] +; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]], i8 [[MASK]], i32 8) +; CHECK-NEXT: [[RES_1:%.*]] = fadd <4 x float> [[RES0]], [[RES1]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES_1]] +; + %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8) + %res.1 = fadd <4 x float> %res0, %res1 + ret <4 x float> %res.1 +} + +define <4 x float> @test_maskz_getexp_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_getexp_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8) + ret <4 x float> %res +} + +declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone + +define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_getexp_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 4) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4) + ret <2 x double> %res +} + +define <2 x double> @test_mask_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_getexp_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 
[[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES0:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]] +; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]], i8 [[MASK]], i32 8) +; CHECK-NEXT: [[RES_1:%.*]] = fadd <2 x double> [[RES0]], [[RES1]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES_1]] +; + %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8) + %res.1 = fadd <2 x double> %res0, %res1 + ret <2 x double> %res.1 +} + +define <2 x double> @test_maskz_getexp_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_getexp_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; 
CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8) + ret <2 x double> %res +} + +declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32) + +define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 5, i8 [[X3:%.*]], i32 8) +; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[RES4]] +; + %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8) + ret i8 %res4 +} + +define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_sd_all( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 2, i8 -1, i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; 
CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 3, i8 -1, i32 8) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]] +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[RES3:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 4, i8 [[X3:%.*]], i32 4) +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP17]], 0 +; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSCMP10]], [[_MSCMP11]] +; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR14:%.*]] = or i1 [[_MSOR12]], [[_MSCMP13]] +; CHECK-NEXT: br i1 [[_MSOR14]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] +; CHECK: 18: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 19: +; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 5, i8 [[X3]], i32 8) +; CHECK-NEXT: [[TMP20:%.*]] = xor i8 [[RES1]], -1 +; CHECK-NEXT: [[TMP21:%.*]] = xor i8 [[RES2]], -1 +; CHECK-NEXT: [[TMP22:%.*]] = and i8 [[TMP20]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = and i8 0, [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = or i8 0, [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = or i8 [[TMP24]], [[TMP23]] +; CHECK-NEXT: [[RES11:%.*]] = or i8 [[RES1]], [[RES2]] +; CHECK-NEXT: [[TMP26:%.*]] = xor i8 [[RES3]], -1 +; CHECK-NEXT: [[TMP27:%.*]] = xor i8 [[RES4]], -1 +; CHECK-NEXT: [[TMP28:%.*]] = and i8 [[TMP26]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = and i8 0, [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = or i8 0, [[TMP28]] +; CHECK-NEXT: [[TMP31:%.*]] = or i8 [[TMP30]], [[TMP29]] +; CHECK-NEXT: [[RES12:%.*]] = or i8 [[RES3]], [[RES4]] +; CHECK-NEXT: [[TMP32:%.*]] = xor i8 [[RES11]], -1 +; CHECK-NEXT: [[TMP33:%.*]] = xor i8 [[RES12]], -1 +; CHECK-NEXT: [[TMP34:%.*]] = and i8 [[TMP25]], [[TMP31]] +; CHECK-NEXT: [[TMP35:%.*]] = and i8 [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = and i8 [[TMP25]], [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = or i8 [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = or i8 [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[RES13:%.*]] = or i8 [[RES11]], [[RES12]] +; CHECK-NEXT: store i8 [[TMP38]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[RES13]] +; + %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4) + %res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8) + %res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, 
i8 %x3, i32 4) + %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8) + + %res11 = or i8 %res1, %res2 + %res12 = or i8 %res3, %res4 + %res13 = or i8 %res11, %res12 + ret i8 %res13 +} + +declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32) + +define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 3, i8 [[X3:%.*]], i32 4) +; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[RES2]] +; + %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4) + ret i8 %res2 +} + + +define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_ss_all( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 2, i8 -1, i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; 
+; CHECK: 11:
+; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 3, i8 -1, i32 8)
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]]
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK: 14:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 15:
+; CHECK-NEXT: [[RES3:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 4, i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSCMP10]], [[_MSCMP11]]
+; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR14:%.*]] = or i1 [[_MSOR12]], [[_MSCMP13]]
+; CHECK-NEXT: br i1 [[_MSOR14]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]]
+; CHECK: 18:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 19:
+; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 5, i8 [[X3]], i32 8)
+; CHECK-NEXT: [[TMP20:%.*]] = and i8 [[RES1]], 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i8 0, [[RES2]]
+; CHECK-NEXT: [[TMP22:%.*]] = or i8 0, [[TMP20]]
+; CHECK-NEXT: [[TMP23:%.*]] = or i8 [[TMP22]], [[TMP21]]
+; CHECK-NEXT: [[RES11:%.*]] = and i8 [[RES1]], [[RES2]]
+; CHECK-NEXT: [[TMP24:%.*]] = and i8 [[RES3]], 0
+; CHECK-NEXT: [[TMP25:%.*]] = and i8 0, [[RES4]]
+; CHECK-NEXT: [[TMP26:%.*]] = or i8 0, [[TMP24]]
+; CHECK-NEXT: [[TMP27:%.*]] = or i8 [[TMP26]], [[TMP25]]
+; CHECK-NEXT: [[RES12:%.*]] = and i8 [[RES3]], [[RES4]]
+; CHECK-NEXT: [[TMP28:%.*]] = and i8 [[TMP23]], [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = and i8 [[RES11]], [[TMP27]]
+; CHECK-NEXT: [[TMP30:%.*]] = and i8 [[TMP23]], [[RES12]]
+; CHECK-NEXT: [[TMP31:%.*]] = or i8 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP32:%.*]] = or i8 [[TMP31]], [[TMP30]]
+; CHECK-NEXT: [[RES13:%.*]] = and i8 [[RES11]], [[RES12]]
+; CHECK-NEXT: store i8 [[TMP32]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i8 [[RES13]]
+;
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4)
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8)
+
+ %res11 = and i8 %res1, %res2
+ %res12 = and i8 %res3, %res4
+ %res13 = and i8 %res11, %res12
+ ret i8 %res13
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> [[X0:%.*]], i32 11, <8 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> [[X0]], i32 11, <8 x double> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> [[X0:%.*]], i32 11, <16 x float> [[X2:%.*]], i16 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> [[X0]], i32 11, <16 x float> [[X2]], i16 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 11, <2 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 12, <2 x double> zeroinitializer, i8 [[X3]], i32 4)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 13, <2 x double> [[X2]], i8 [[X3]], i32 8)
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP21:%.*]] = icmp ne i128 [[TMP21]], 0
+; CHECK-NEXT: [[_MSOR22:%.*]] = or i1 [[_MSOR20]], [[_MSCMP21]]
+; CHECK-NEXT: br i1 [[_MSOR22]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]]
+; CHECK: 22:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 23:
+; CHECK-NEXT: [[RES3:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 14, <2 x double> [[X2]], i8 -1, i32 4)
+; CHECK-NEXT: [[RES11:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES12:%.*]] = fadd <2 x double> [[RES2]], [[RES3]]
+; CHECK-NEXT: [[RES13:%.*]] = fadd <2 x double> [[RES11]], [[RES12]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES13]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 12, <2 x double> zeroinitializer, i8 %x3, i32 4)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 13, <2 x double> %x2, i8 %x3, i32 8)
+ %res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 14, <2 x double> %x2, i8 -1, i32 4)
+ %res11 = fadd <2 x double> %res, %res1
+ %res12 = fadd <2 x double> %res2, %res3
+ %res13 = fadd <2 x double> %res11, %res12
+ ret <2 x double> %res13
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 11, <4 x float> [[X2:%.*]], i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 12, <4 x float> zeroinitializer, i8 [[X3]], i32 4)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 13, <4 x float> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT: [[_MSOR18:%.*]] = or i1 [[_MSCMP16]], [[_MSCMP17]]
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP21]], 0
+; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSOR18]], [[_MSCMP19]]
+; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]]
+; CHECK: 22:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 23:
+; CHECK-NEXT: [[RES3:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 14, <4 x float> [[X2]], i8 -1, i32 4)
+; CHECK-NEXT: [[RES11:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES12:%.*]] = fadd <4 x float> [[RES2]], [[RES3]]
+; CHECK-NEXT: [[RES13:%.*]] = fadd <4 x float> [[RES11]], [[RES12]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES13]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 12, <4 x float> zeroinitializer, i8 %x3, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 13, <4 x float> %x2, i8 -1, i32 8)
+ %res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 14, <4 x float> %x2, i8 -1, i32 4)
+ %res11 = fadd <4 x float> %res, %res1
+ %res12 = fadd <4 x float> %res2, %res3
+ %res13 = fadd <4 x float> %res11, %res12
+ ret <4 x float> %res13
+}
+
+define <4 x float> @test_int_x86_avx512_mask_getmant_ss_load(<4 x float> %x0, ptr %x1p) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ss_load(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[X1:%.*]] = load <4 x float>, ptr [[X1P:%.*]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[X1P]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1]], i32 11, <4 x float> undef, i8 -1, i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %x1 = load <4 x float>, ptr %x1p
+ %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)
+
+define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> [[RES]], <8 x double> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> %x2
+ ret <8 x double> %res2
+}
+
+define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_maskz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> [[RES]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> zeroinitializer
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>)
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
+ ret <16 x float> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_maskz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
+ ret <16 x float> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> )
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> )
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> )
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> )
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
+ ret <16 x float> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> )
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> )
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
+ ret <16 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ss2sd_round(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> [[X0]], <4 x float> [[X1]], <2 x double> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES2]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_sd2ss_round(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> [[X0:%.*]], <2 x double> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> [[X0]], <2 x double> [[X1]], <4 x float> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES2]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 11)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
+
+define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pternlog_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP9]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+ ret <16 x i32> %1
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> [[TMP1]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X0]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X0]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+ %2 = bitcast i16 %x4 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+ ret <16 x i32> %3
+}
+
+define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+ %2 = bitcast i16 %x4 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+ ret <16 x i32> %3
+}
+
+declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
+
+define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pternlog_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP9]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+ ret <8 x i64> %1
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> [[TMP1]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[X0]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[X0]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP17]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+ %2 = bitcast i8 %x4 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ ret <8 x i64> %3
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP17]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+ %2 = bitcast i8 %x4 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
+}
+
+define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_comi_sd_eq_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 0, i32 8) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8) + ret i32 %res +} + +define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_ucomi_sd_eq_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 8, i32 8) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8) + ret i32 %res +} + +define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_comi_sd_eq( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 0, i32 4) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4) + ret i32 %res +} + +define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_ucomi_sd_eq( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], 
label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 8, i32 4) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4) + ret i32 %res +} + +define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_comi_sd_lt_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 1, i32 8) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8) + ret i32 %res +} + +define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_ucomi_sd_lt_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 9, i32 8) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8) + ret i32 %res +} + +define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_comi_sd_lt( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; 
CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 1, i32 4) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4) + ret i32 %res +} + +define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_ucomi_sd_lt( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 9, i32 4) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32) + +define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_ucomi_ss_lt( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i32 9, i32 4) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32) + +declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) + +define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr 
@__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP7]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) + ret <8 x double> %1 +} + +define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP18]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x2 + ret <8 
x double> %3 +} + +define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) + +define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) + ret <8 x i64> %1 +} + +define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 + ret <8 x i64> %3 +} + +define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) + +define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) + ret <16 x float> %1 +} + +define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %x2 + ret <16 x float> %3 +} + +define <16 x 
float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) + ret <16 x i32> %1 +} + +define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 + ret <16 x i32> %3 +} + +define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32) + +define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) #0 { +; CHECK-LABEL: 
@test_int_x86_avx512_mask_fixupimm_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 4, i8 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> [[X1]], <8 x i64> [[X2]], i32 5, i8 [[X4]], i32 4) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> [[X2]], i32 3, i8 -1, i32 8) +; CHECK-NEXT: [[RES3:%.*]] = fadd <8 x double> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES4]] +; + %res = 
call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4) + %res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res3, %res2 + ret <8 x double> %res4 +} + +define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512_load(<8 x double> %x0, <8 x double> %x1, ptr %x2ptr) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_pd_512_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i64>, ptr [[X2PTR:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[_MSLD]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2]], i32 3, i8 -1, i32 4) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %x2 = load <8 x i64>, ptr %x2ptr + %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 4) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32) + +define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 
8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 3, i8 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 4) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> [[X2]], i32 2, i8 -1, i32 8) +; CHECK-NEXT: [[RES3:%.*]] = fadd <8 x double> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES4]] +; + %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4) + %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4) + %res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> 
%x2, i32 2, i8 -1, i32 8) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res3, %res2 + ret <8 x double> %res4 +} + +declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 5, i8 [[X4]], i32 4) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> 
@llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 5, i8 -1, i32 8) +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4) + %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res3, %res2 + ret <4 x float> %res4 +} + +declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32) + +define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 5, i8 [[X4]], i32 8) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x 
i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 6, i8 -1, i32 4) +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8) + %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 6, i8 -1, i32 4) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res3, %res2 + ret <4 x float> %res4 +} + +declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32) + +define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 5, i16 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = 
icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i16 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> zeroinitializer, i32 5, i16 [[X4]], i32 4) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> [[X2]], i32 5, i16 -1, i32 8) +; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES4]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4) + %res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res3, %res2 + ret <16 x float> %res4 +} + +define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512_load(<16 x float> %x0, <16 x float> %x1, ptr %x2ptr) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ps_512_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2PTR:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load 
<16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2]], i32 5, i16 -1, i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %x2 = load <16 x i32>, ptr %x2ptr + %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32) + +define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 5, i16 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i16 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 
[[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> zeroinitializer, i32 6, i16 [[X4]], i32 8) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> [[X2]], i32 7, i16 -1, i32 4) +; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES4]] +; + %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 6, i16 %x4, i32 8) + %res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 7, i16 -1, i32 4) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res3, %res2 + ret <16 x float> %res4 +} + +declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], 
label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 8) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 6, i8 -1, i32 4) +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) + %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 6, i8 -1, i32 4) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res3, %res2 + ret <2 x double> %res4 +} + +declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32) + +define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 8) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] +; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 5, i8 [[X4]], i32 8) +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) + %res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x 
double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res3, %res2 + ret <2 x double> %res4 +} + +declare double @llvm.fma.f64(double, double, double) #1 +declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #0 + +define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP5]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP5]] +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X0]], double [[TMP18]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0 +; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] +; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] +; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label 
[[TMP24:%.*]], !prof [[PROF1]] +; CHECK: 23: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 24: +; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11) +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0 +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0 +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] +; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] +; CHECK: 30: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 31: +; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10) +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP11]] +; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64 +; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP27]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP11]] +; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]] +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP27]] +; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT15]], i64 0 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X0]], double [[TMP42]], i64 0 +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]] +; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %1 = extractelement <2 x double> %x0, i64 0 + %2 = extractelement <2 x double> %x1, i64 0 + %3 = extractelement <2 x double> %x2, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double %1 + %8 = insertelement <2 x double> %x0, double %7, i64 0 + %9 = extractelement <2 x double> %x0, i64 0 + %10 = extractelement <2 x double> %x1, i64 0 + %11 = extractelement <2 x double> %x2, i64 0 + %12 = call double 
@llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11) + %13 = insertelement <2 x double> %x0, double %12, i64 0 + %14 = extractelement <2 x double> %x0, i64 0 + %15 = extractelement <2 x double> %x1, i64 0 + %16 = extractelement <2 x double> %x2, i64 0 + %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 10) + %18 = bitcast i8 %x3 to <8 x i1> + %19 = extractelement <8 x i1> %18, i64 0 + %20 = select i1 %19, double %17, double %14 + %21 = insertelement <2 x double> %x0, double %20, i64 0 + %res3 = fadd <2 x double> %8, %13 + %res4 = fadd <2 x double> %21, %res3 + ret <2 x double> %res4 +} + +define <4 x float> @test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP5]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP5]] +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne 
i32 [[_MSPROP7]], 0 +; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] +; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] +; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] +; CHECK: 23: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 24: +; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11) +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0 +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0 +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] +; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] +; CHECK: 30: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 31: +; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10) +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP11]] +; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32 +; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP27]] to i32 +; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP11]] +; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]] +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP27]] +; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT15]], i64 0 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X0]], float [[TMP42]], i64 0 +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]] +; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %x1, i64 0 + %3 = extractelement <4 x float> %x2, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = 
extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %1 + %8 = insertelement <4 x float> %x0, float %7, i64 0 + %9 = extractelement <4 x float> %x0, i64 0 + %10 = extractelement <4 x float> %x1, i64 0 + %11 = extractelement <4 x float> %x2, i64 0 + %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11) + %13 = insertelement <4 x float> %x0, float %12, i64 0 + %14 = extractelement <4 x float> %x0, i64 0 + %15 = extractelement <4 x float> %x1, i64 0 + %16 = extractelement <4 x float> %x2, i64 0 + %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 10) + %18 = bitcast i8 %x3 to <8 x i1> + %19 = extractelement <8 x i1> %18, i64 0 + %20 = select i1 %19, float %17, float %14 + %21 = insertelement <4 x float> %x0, float %20, i64 0 + %res3 = fadd <4 x float> %8, %13 + %res4 = fadd <4 x float> %21, %res3 + ret <4 x float> %res4 +} + +define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_sd( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 0, i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double [[TMP4]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP10]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i64 [[TMP11]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], double [[TMP4]], double 0.000000e+00 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> [[X0]], double [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]], i32 11) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 0, i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast double [[TMP17]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select i1 false, i64 [[TMP24]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], double [[TMP17]], double 0.000000e+00 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0 +; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[TMP13]], [[TMP26]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES2]] +; + %1 = extractelement <2 x double> %x0, i64 0 + %2 = extractelement <2 x double> %x1, i64 0 + %3 = extractelement <2 x double> %x2, i64 0 + %4 = call double @llvm.fma.f64(double 
%1, double %2, double %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double 0.000000e+00 + %8 = insertelement <2 x double> %x0, double %7, i64 0 + %9 = extractelement <2 x double> %x0, i64 0 + %10 = extractelement <2 x double> %x1, i64 0 + %11 = extractelement <2 x double> %x2, i64 0 + %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11) + %13 = bitcast i8 %x3 to <8 x i1> + %14 = extractelement <8 x i1> %13, i64 0 + %15 = select i1 %14, double %12, double 0.000000e+00 + %16 = insertelement <2 x double> %x0, double %15, i64 0 + %res2 = fadd <2 x double> %8, %16 + ret <2 x double> %res2 +} + +declare float @llvm.fma.f32(float, float, float) #1 +declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #0 + +define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float [[TMP4]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = or i32 [[TMP10]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP11]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], float [[TMP4]], float 0.000000e+00 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[X0]], float [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]], i32 11) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP17]] to i32 +; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select i1 false, i32 [[TMP24]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], float [[TMP17]], float 0.000000e+00 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0 +; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[TMP13]], [[TMP26]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES2]] +; + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %x1, i64 0 + %3 = extractelement <4 x float> %x2, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float 
%4, float 0.000000e+00 + %8 = insertelement <4 x float> %x0, float %7, i64 0 + %9 = extractelement <4 x float> %x0, i64 0 + %10 = extractelement <4 x float> %x1, i64 0 + %11 = extractelement <4 x float> %x2, i64 0 + %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11) + %13 = bitcast i8 %x3 to <8 x i1> + %14 = extractelement <8 x i1> %13, i64 0 + %15 = select i1 %14, float %12, float 0.000000e+00 + %16 = insertelement <4 x float> %x0, float %15, i64 0 + %res2 = fadd <4 x float> %8, %16 + ret <4 x float> %res2 +} + +define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_load0(i8 zeroext %0, ptr nocapture readonly %1, float %2, float %3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_load0( +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[TMP1:%.*]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080 +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP14]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[_MSLD]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], [[TMP6]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or i32 [[_MSPROP1]], [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = tail call float @llvm.fma.f32(float [[TMP15]], float [[TMP2:%.*]], float [[TMP3:%.*]]) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP8]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP0:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <8 x i1> [[TMP17]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[_MSPROP2]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP16]] to i32 +; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP3]], i32 [[TMP24]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], float [[TMP16]], float 0.000000e+00 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <4 x i32> [[_MSLD]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP25]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP26]] +; + %5 = load <4 x float>, ptr %1, align 16 + %6 = extractelement <4 x float> %5, i64 0 + %7 = tail call float @llvm.fma.f32(float %6, float %2, float %3) #2 + %8 = bitcast i8 %0 to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = 
select i1 %9, float %7, float 0.000000e+00 + %11 = insertelement <4 x float> %5, float %10, i64 0 + ret <4 x float> %11 +} + +define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP2]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP7]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP7]] +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X2]], double [[TMP18]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0 +; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] +; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] +; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] +; CHECK: 23: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 24: +; CHECK-NEXT: 
[[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11) +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X2]], double [[TMP25]], i64 0 +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0 +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] +; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] +; CHECK: 30: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 31: +; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10) +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP13]] +; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64 +; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP29]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP13]] +; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]] +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP29]] +; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT15]], i64 0 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X2]], double [[TMP42]], i64 0 +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]] +; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %1 = extractelement <2 x double> %x0, i64 0 + %2 = extractelement <2 x double> %x1, i64 0 + %3 = extractelement <2 x double> %x2, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double %3 + %8 = insertelement <2 x double> %x2, double %7, i64 0 + %9 = extractelement <2 x double> %x0, i64 0 + %10 = extractelement <2 x double> %x1, i64 0 + %11 = extractelement <2 x double> %x2, i64 0 + %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11) + %13 = insertelement <2 x double> %x2, double %12, i64 0 + %14 = extractelement <2 x double> %x0, i64 0 + %15 
= extractelement <2 x double> %x1, i64 0 + %16 = extractelement <2 x double> %x2, i64 0 + %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 10) + %18 = bitcast i8 %x3 to <8 x i1> + %19 = extractelement <8 x i1> %18, i64 0 + %20 = select i1 %19, double %17, double %16 + %21 = insertelement <2 x double> %x2, double %20, i64 0 + %res3 = fadd <2 x double> %8, %13 + %res4 = fadd <2 x double> %21, %res3 + ret <2 x double> %res4 +} + +define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP2]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP7]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP7]] +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X2]], float [[TMP18]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0 +; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] +; CHECK-NEXT: [[_MSCMP20:%.*]] = 
icmp ne i32 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] +; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] +; CHECK: 23: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 24: +; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11) +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP3]], i32 0, i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X2]], float [[TMP25]], i64 0 +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0 +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] +; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] +; CHECK: 30: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 31: +; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10) +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP13]] +; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32 +; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP29]] to i32 +; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP13]] +; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]] +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP29]] +; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT15]], i64 0 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X2]], float [[TMP42]], i64 0 +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]] +; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %x1, i64 0 + %3 = extractelement <4 x float> %x2, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %3 + %8 = insertelement <4 x float> %x2, float %7, i64 0 + %9 = extractelement <4 x float> %x0, i64 0 + %10 = 
extractelement <4 x float> %x1, i64 0 + %11 = extractelement <4 x float> %x2, i64 0 + %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11) + %13 = insertelement <4 x float> %x2, float %12, i64 0 + %14 = extractelement <4 x float> %x0, i64 0 + %15 = extractelement <4 x float> %x1, i64 0 + %16 = extractelement <4 x float> %x2, i64 0 + %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 10) + %18 = bitcast i8 %x3 to <8 x i1> + %19 = extractelement <8 x i1> %18, i64 0 + %20 = select i1 %19, float %17, float %16 + %21 = insertelement <4 x float> %x2, float %20, i64 0 + %res3 = fadd <4 x float> %8, %13 + %res4 = fadd <4 x float> %21, %res3 + ret <4 x float> %res4 +} + +define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) #0 { +; CHECK-LABEL: @fmadd_ss_mask_memfold( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1 +; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2 +; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3 +; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3 +; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0 +; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1 +; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2 +; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 
x float> [[BV1]], float 0.000000e+00, i32 2 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3 +; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0 +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]] +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 [[_MSPROP9]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP14]] to i32 +; CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], [[_MSPROP13]] +; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP25]], [[_MSPROP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP26]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], float [[TMP17]], float [[TMP14]] +; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[AV]], float [[TMP27]], i64 0 +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0 +; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP28]], i32 0 +; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]] +; CHECK: 29: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 30: +; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080 +; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr +; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP33]], align 4 +; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4 +; CHECK-NEXT: ret void +; + %a.val = load float, ptr %a + %av0 = insertelement <4 x float> undef, float %a.val, i32 0 + %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1 + %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2 + %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3 + + %b.val = load float, ptr %b + %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 + %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 + %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 + %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 + %1 = extractelement <4 x float> %av, i64 0 + %2 = extractelement <4 x float> %bv, i64 0 + %3 = extractelement <4 x float> %av, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 
x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %1 + %8 = insertelement <4 x float> %av, float %7, i64 0 + %sr = extractelement <4 x float> %8, i32 0 + store float %sr, ptr %a + ret void +} + +define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 { +; CHECK-LABEL: @fmadd_ss_maskz_memfold( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1 +; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2 +; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3 +; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3 +; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0 +; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1 +; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2 +; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3 +; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0 +; 
CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]] +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = xor i32 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP13]] +; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP25]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], float [[TMP17]], float 0.000000e+00 +; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[AV]], float [[TMP26]], i64 0 +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0 +; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP27]], i32 0 +; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080 +; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP32]], align 4 +; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4 +; CHECK-NEXT: ret void +; + %a.val = load float, ptr %a + %av0 = insertelement <4 x float> undef, float %a.val, i32 0 + %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1 + %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2 + %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3 + + %b.val = load float, ptr %b + %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 + %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 + %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 + %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 + %1 = extractelement <4 x float> %av, i64 0 + %2 = extractelement <4 x float> %bv, i64 0 + %3 = extractelement <4 x float> %av, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float 0.000000e+00 + %8 = insertelement <4 x float> %av, float %7, i64 0 + %sr = extractelement <4 x float> %8, i32 0 + store float %sr, ptr %a + ret void +} + +define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) #0 { +; CHECK-LABEL: @fmadd_sd_mask_memfold( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls 
to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1 +; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0 +; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1 +; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]] +; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]] +; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 [[_MSPROP5]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast double [[TMP14]] to i64 +; CHECK-NEXT: [[TMP24:%.*]] = xor i64 [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], [[_MSPROP9]] +; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP25]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP26]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], double [[TMP17]], double [[TMP14]] +; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x 
i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> [[AV]], double [[TMP27]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0 +; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP28]], i32 0 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]] +; CHECK: 29: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 30: +; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080 +; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr +; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP33]], align 8 +; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8 +; CHECK-NEXT: ret void +; + %a.val = load double, ptr %a + %av0 = insertelement <2 x double> undef, double %a.val, i32 0 + %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 + + %b.val = load double, ptr %b + %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 + %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 + %1 = extractelement <2 x double> %av, i64 0 + %2 = extractelement <2 x double> %bv, i64 0 + %3 = extractelement <2 x double> %av, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double %1 + %8 = insertelement <2 x double> %av, double %7, i64 0 + %sr = extractelement <2 x double> %8, i32 0 + store double %sr, ptr %a + ret void +} + +define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 { +; CHECK-LABEL: @fmadd_sd_maskz_memfold( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1 +; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 +; 
CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0 +; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1 +; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]] +; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]] +; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = xor i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP9]] +; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP25]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], double [[TMP17]], double 0.000000e+00 +; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[AV]], double [[TMP26]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0 +; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP27]], i32 0 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080 +; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP32]], align 8 +; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8 +; CHECK-NEXT: ret void +; + %a.val = load double, ptr %a + %av0 = insertelement <2 x double> undef, double %a.val, i32 0 + %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 + + %b.val = load double, ptr %b + %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 + %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 + %1 = extractelement <2 x double> %av, i64 0 + %2 = extractelement <2 x double> %bv, i64 0 + %3 = extractelement <2 x double> %av, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double 0.000000e+00 + 
%8 = insertelement <2 x double> %av, double %7, i64 0 + %sr = extractelement <2 x double> %8, i32 0 + store double %sr, ptr %a + ret void +} + +define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP6]], double [[TMP7]], double [[TMP8]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[_MSPROP4]], i64 [[_MSPROP5]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast double [[TMP9]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP17]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP19]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], double [[TMP9]], double [[TMP10]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> [[X2]], double [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP22]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0 +; 
CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] +; CHECK: 26: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 27: +; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP23]], double [[TMP24]], double [[TMP25]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> [[X2]], double [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP31]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] +; CHECK: 35: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 36: +; CHECK-NEXT: [[TMP37:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP32]], double [[TMP33]], double [[TMP34]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i64 0, i64 [[_MSPROP16]] +; CHECK-NEXT: [[TMP43:%.*]] = bitcast double [[TMP37]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast double [[TMP38]] to i64 +; CHECK-NEXT: [[TMP45:%.*]] = xor i64 [[TMP43]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = or i64 [[TMP46]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP47]], i64 [[TMP42]] +; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], double [[TMP37]], double [[TMP38]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x double> [[X2]], double [[TMP48]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP21]], [[TMP30]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP49]], [[RES3]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %1 = fneg <2 x double> %x2 + %2 = extractelement <2 x double> %x0, i64 0 
+ %3 = extractelement <2 x double> %x1, i64 0 + %4 = extractelement <2 x double> %1, i64 0 + %5 = call double @llvm.fma.f64(double %2, double %3, double %4) + %6 = extractelement <2 x double> %x2, i64 0 + %7 = bitcast i8 %x3 to <8 x i1> + %8 = extractelement <8 x i1> %7, i64 0 + %9 = select i1 %8, double %5, double %6 + %10 = insertelement <2 x double> %x2, double %9, i64 0 + %11 = fneg <2 x double> %x2 + %12 = extractelement <2 x double> %x0, i64 0 + %13 = extractelement <2 x double> %x1, i64 0 + %14 = extractelement <2 x double> %11, i64 0 + %15 = call double @llvm.x86.avx512.vfmadd.f64(double %12, double %13, double %14, i32 11) + %16 = extractelement <2 x double> %x2, i64 0 + %17 = insertelement <2 x double> %x2, double %15, i64 0 + %18 = fneg <2 x double> %x2 + %19 = extractelement <2 x double> %x0, i64 0 + %20 = extractelement <2 x double> %x1, i64 0 + %21 = extractelement <2 x double> %18, i64 0 + %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 10) + %23 = extractelement <2 x double> %x2, i64 0 + %24 = bitcast i8 %x3 to <8 x i1> + %25 = extractelement <8 x i1> %24, i64 0 + %26 = select i1 %25, double %22, double %23 + %27 = insertelement <2 x double> %x2, double %26, i64 0 + %res3 = fadd <2 x double> %10, %17 + %res4 = fadd <2 x double> %27, %res3 + ret <2 x double> %res4 +} + +define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP9:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float [[TMP7]], float [[TMP8]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[_MSPROP4]], i32 [[_MSPROP5]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast float [[TMP9]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = xor i32 [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP17]], [[_MSPROP4]] 
+; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP19]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], float [[TMP9]], float [[TMP10]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[X2]], float [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP22]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] +; CHECK: 26: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 27: +; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[X2]], float [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP31]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] +; CHECK: 35: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 36: +; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP32]], float [[TMP33]], float [[TMP34]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 0, i32 [[_MSPROP16]] +; 
CHECK-NEXT: [[TMP43:%.*]] = bitcast float [[TMP37]] to i32 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast float [[TMP38]] to i32 +; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP46]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP47]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], float [[TMP37]], float [[TMP38]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[X2]], float [[TMP48]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP21]], [[TMP30]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP49]], [[RES3]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %1 = fneg <4 x float> %x2 + %2 = extractelement <4 x float> %x0, i64 0 + %3 = extractelement <4 x float> %x1, i64 0 + %4 = extractelement <4 x float> %1, i64 0 + %5 = call float @llvm.fma.f32(float %2, float %3, float %4) + %6 = extractelement <4 x float> %x2, i64 0 + %7 = bitcast i8 %x3 to <8 x i1> + %8 = extractelement <8 x i1> %7, i64 0 + %9 = select i1 %8, float %5, float %6 + %10 = insertelement <4 x float> %x2, float %9, i64 0 + %11 = fneg <4 x float> %x2 + %12 = extractelement <4 x float> %x0, i64 0 + %13 = extractelement <4 x float> %x1, i64 0 + %14 = extractelement <4 x float> %11, i64 0 + %15 = call float @llvm.x86.avx512.vfmadd.f32(float %12, float %13, float %14, i32 11) + %16 = extractelement <4 x float> %x2, i64 0 + %17 = insertelement <4 x float> %x2, float %15, i64 0 + %18 = fneg <4 x float> %x2 + %19 = extractelement <4 x float> %x0, i64 0 + %20 = extractelement <4 x float> %x1, i64 0 + %21 = extractelement <4 x float> %18, i64 0 + %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 10) + %23 = extractelement <4 x float> %x2, i64 0 + %24 = bitcast i8 %x3 to <8 x i1> + %25 = extractelement <8 x i1> %24, i64 0 + %26 = select i1 %25, float %22, float %23 + %27 = insertelement <4 x float> %x2, float %26, i64 0 + %res3 = fadd <4 x float> %10, %17 + %res4 = fadd <4 x float> %27, %res3 + ret <4 x float> %res4 +} + +define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X0:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = fneg <2 x double> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = 
extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[_MSPROP4]], i64 [[_MSPROP5]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast double [[TMP11]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP20]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], double [[TMP10]], double [[TMP11]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[X2]], double [[TMP21]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = fneg <2 x double> [[X0]] +; CHECK-NEXT: [[TMP24:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP23]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[TMP24]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP2]], i64 0, i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[X2]], double [[TMP30]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = fneg <2 x double> [[X0]] +; CHECK-NEXT: [[TMP34:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x 
double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x double> [[TMP34]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] +; CHECK: 38: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 39: +; CHECK-NEXT: [[TMP40:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP35]], double [[TMP36]], double [[TMP37]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0 +; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i64 0, i64 [[_MSPROP16]] +; CHECK-NEXT: [[TMP46:%.*]] = bitcast double [[TMP40]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast double [[TMP41]] to i64 +; CHECK-NEXT: [[TMP48:%.*]] = xor i64 [[TMP46]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = or i64 [[TMP48]], 0 +; CHECK-NEXT: [[TMP50:%.*]] = or i64 [[TMP49]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP50]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], double [[TMP40]], double [[TMP41]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x double> [[X2]], double [[TMP51]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP22]], [[TMP32]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP52]], [[RES3]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %1 = fneg <2 x double> %x0 + %2 = fneg <2 x double> %x2 + %3 = extractelement <2 x double> %1, i64 0 + %4 = extractelement <2 x double> %x1, i64 0 + %5 = extractelement <2 x double> %2, i64 0 + %6 = call double @llvm.fma.f64(double %3, double %4, double %5) + %7 = extractelement <2 x double> %x2, i64 0 + %8 = bitcast i8 %x3 to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = select i1 %9, double %6, double %7 + %11 = insertelement <2 x double> %x2, double %10, i64 0 + %12 = fneg <2 x double> %x0 + %13 = fneg <2 x double> %x2 + %14 = extractelement <2 x double> %12, i64 0 + %15 = extractelement <2 x double> %x1, i64 0 + %16 = extractelement <2 x double> %13, i64 0 + %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 11) + %18 = extractelement <2 x double> %x2, i64 0 + %19 = insertelement <2 x double> %x2, double %17, i64 0 + %20 = fneg <2 x double> %x0 + %21 = fneg <2 x double> %x2 + %22 = extractelement <2 x double> %20, i64 0 + %23 = extractelement <2 x double> %x1, i64 0 + %24 = extractelement <2 x double> %21, i64 0 + %25 = call double 
@llvm.x86.avx512.vfmadd.f64(double %22, double %23, double %24, i32 10) + %26 = extractelement <2 x double> %x2, i64 0 + %27 = bitcast i8 %x3 to <8 x i1> + %28 = extractelement <8 x i1> %27, i64 0 + %29 = select i1 %28, double %25, double %26 + %30 = insertelement <2 x double> %x2, double %29, i64 0 + %res3 = fadd <2 x double> %11, %19 + %res4 = fadd <2 x double> %30, %res3 + ret <2 x double> %res4 +} + +define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X0:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = fneg <4 x float> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[_MSPROP4]], i32 [[_MSPROP5]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast float [[TMP11]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP20]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], float [[TMP10]], float [[TMP11]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[X2]], float [[TMP21]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = fneg <4 x float> [[X0]] +; CHECK-NEXT: [[TMP24:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP23]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = 
extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP24]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[X2]], float [[TMP30]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = fneg <4 x float> [[X0]] +; CHECK-NEXT: [[TMP34:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP34]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] +; CHECK: 38: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 39: +; CHECK-NEXT: [[TMP40:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP35]], float [[TMP36]], float [[TMP37]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0 +; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 0, i32 [[_MSPROP16]] +; CHECK-NEXT: [[TMP46:%.*]] = bitcast float [[TMP40]] to i32 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast float [[TMP41]] to i32 +; CHECK-NEXT: [[TMP48:%.*]] = xor i32 [[TMP46]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], 0 +; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP50]], i32 [[TMP45]] +; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], float [[TMP40]], float [[TMP41]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[X2]], float [[TMP51]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]] +; 
CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP22]], [[TMP32]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP52]], [[RES3]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %1 = fneg <4 x float> %x0 + %2 = fneg <4 x float> %x2 + %3 = extractelement <4 x float> %1, i64 0 + %4 = extractelement <4 x float> %x1, i64 0 + %5 = extractelement <4 x float> %2, i64 0 + %6 = call float @llvm.fma.f32(float %3, float %4, float %5) + %7 = extractelement <4 x float> %x2, i64 0 + %8 = bitcast i8 %x3 to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = select i1 %9, float %6, float %7 + %11 = insertelement <4 x float> %x2, float %10, i64 0 + %12 = fneg <4 x float> %x0 + %13 = fneg <4 x float> %x2 + %14 = extractelement <4 x float> %12, i64 0 + %15 = extractelement <4 x float> %x1, i64 0 + %16 = extractelement <4 x float> %13, i64 0 + %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 11) + %18 = extractelement <4 x float> %x2, i64 0 + %19 = insertelement <4 x float> %x2, float %17, i64 0 + %20 = fneg <4 x float> %x0 + %21 = fneg <4 x float> %x2 + %22 = extractelement <4 x float> %20, i64 0 + %23 = extractelement <4 x float> %x1, i64 0 + %24 = extractelement <4 x float> %21, i64 0 + %25 = call float @llvm.x86.avx512.vfmadd.f32(float %22, float %23, float %24, i32 10) + %26 = extractelement <4 x float> %x2, i64 0 + %27 = bitcast i8 %x3 to <8 x i1> + %28 = extractelement <8 x i1> %27, i64 0 + %29 = select i1 %28, float %25, float %26 + %30 = insertelement <4 x float> %x2, float %29, i64 0 + %res3 = fadd <4 x float> %11, %19 + %res4 = fadd <4 x float> %30, %res3 + ret <4 x float> %res4 +} + +define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr%ptr_b ,i8 %x3,i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> 
[[VECINIT_I]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP3]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP12]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP12]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X1]], float [[TMP23]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP24]] +; + %q = load float, ptr %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %vecinit.i, i64 0 + %3 = extractelement <4 x float> %x1, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %3 + %8 = insertelement <4 x float> %x1, float %7, i64 0 + ret <4 x float> %8 +} + +define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = 
extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP1]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP10]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP10]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X0]], float [[TMP23]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP24]] +; + %q = load float, ptr %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %vecinit.i, i64 0 + %3 = extractelement <4 x float> %x1, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %1 + %8 = insertelement <4 x float> %x0, float %7, i64 0 + ret <4 x float> %8 +} + + +define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], 
i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.fma.f32(float [[TMP9]], float [[TMP10]], float [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = select i1 false, i32 [[_MSPROP5]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP12]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP17]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = select i1 false, float [[TMP12]], float 0.000000e+00 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP19]] +; + %q = load float, ptr %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %x1, i64 0 + %3 = extractelement <4 x float> %vecinit.i, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = select i1 false, float %4, float 0.000000e+00 + %6 = insertelement <4 x float> %x0, float %5, i64 0 + ret <4 x float> %6 +} + +define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psll_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psll_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psll_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]] +; 
CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone + + +define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psll_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psll_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> 
[[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psll_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone + + +define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_pslli_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <16 x i32> [[TMP3]], 
ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_pslli_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_pslli_d_512(<16 x i32> %a0, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_pslli_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> 
@llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone + + +define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_pslli_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_pslli_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_pslli_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> 
[[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone + + +define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psra_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psra_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x 
i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psra_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone + + +define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psra_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 
8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psra_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psra_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = 
load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone + + + +define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrai_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrai_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> 
[[TMP4]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psrai_q_512(<8 x i64> %a0, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrai_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone + + +define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrai_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> 
@llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrai_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psrai_d_512(<16 x i32> %a0, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrai_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 
%mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone + + + +define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrl_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrl_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x 
i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrl_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone + + +define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrl_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr 
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrl_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrl_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RES:%.*]] = call <8 
x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone + + +define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrli_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrli_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x 
i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psrli_d_512(<16 x i32> %a0, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrli_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone + + +define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrli_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrli_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: 
[[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psrli_q_512(<8 x i64> %a0, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrli_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone + +define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psllv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> 
[[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_psllv_d_512_const() #0 { +; CHECK-LABEL: @test_x86_avx512_psllv_d_512_const( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES0]], [[RES1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res0 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) + %res1 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) + %res2 = add <16 x i32> %res0, %res1 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psllv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> 
%a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_maskz_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psllv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone + +define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psllv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_psllv_q_512_const() #0 { +; CHECK-LABEL: @test_x86_avx512_psllv_q_512_const( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> 
@llvm.x86.avx512.psllv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[RES2:%.*]] = add <8 x i64> [[RES0]], [[RES1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res0 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) + %res1 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) + %res2 = add <8 x i64> %res0, %res1 + ret <8 x i64> %res2 +} + +define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psllv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 + ret <8 x i64> %res2 +} + +define <8 x i64> @test_x86_avx512_maskz_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psllv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrav_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrav_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call 
<16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_maskz_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrav_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrav_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load 
<8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrav_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 + ret <8 x i64> %res2 +} + +define <8 x i64> @test_x86_avx512_maskz_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrav_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = 
icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrlv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_psrlv_d_512_const() #0 { +; CHECK-LABEL: @test_x86_avx512_psrlv_d_512_const( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES0]], [[RES1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res0 
= call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) + %res1 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) + %res2 = add <16 x i32> %res0, %res1 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrlv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_maskz_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> 
[[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrlv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_psrlv_q_512_const() #0 { +; CHECK-LABEL: @test_x86_avx512_psrlv_q_512_const( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[RES2:%.*]] = add <8 x i64> [[RES0]], [[RES1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res0 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) + %res1 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) + %res2 = add <8 x i64> %res0, %res1 + ret <8 x i64> %res2 +} + +define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrlv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; 
CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 + ret <8 x i64> %res2 +} + +define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> 
%a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone + + +define <8 x double> @test_mm256_castpd128_pd256_freeze(<2 x double> %a0) nounwind #0 { +; CHECK-LABEL: @test_mm256_castpd128_pd256_freeze( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[A1:%.*]] = freeze <2 x double> poison +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> [[A1]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %a1 = freeze <2 x double> poison + %res = shufflevector <2 x double> %a0, <2 x double> %a1, <8 x i32> + ret <8 x double> %res +} + + +define <8 x double> @test_mm256_castpd256_pd256_freeze(<4 x double> %a0) nounwind #0 { +; CHECK-LABEL: @test_mm256_castpd256_pd256_freeze( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[A1:%.*]] = freeze <4 x double> poison +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> [[A1]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %a1 = freeze <4 x double> poison + %res = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> + ret <8 x double> %res +} + + +define <16 x float> @test_mm256_castps128_ps512_freeze(<4 x float> %a0) nounwind #0 { +; CHECK-LABEL: @test_mm256_castps128_ps512_freeze( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[A1:%.*]] = freeze <4 x float> poison +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1]], <16 x i32> +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %a1 = freeze <4 x float> poison + %res = shufflevector <4 x float> %a0, <4 x float> %a1, <16 x i32> + ret <16 x float> %res +} + + +define <16 x float> @test_mm256_castps256_ps512_freeze(<8 x float> %a0) nounwind #0 { +; CHECK-LABEL: @test_mm256_castps256_ps512_freeze( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[A1:%.*]] = freeze <8 x float> poison +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[RES:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> [[A1]], <16 x i32> +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %a1 = freeze <8 x float> poison + %res = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> + ret <16 x float> %res +} + + +define <8 x i64> @test_mm512_castsi128_si512_freeze(<2 x i64> %a0) nounwind #0 { +; CHECK-LABEL: @test_mm512_castsi128_si512_freeze( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT:
call void @llvm.donothing() +; CHECK-NEXT: [[A1:%.*]] = freeze <2 x i64> poison +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i64> [[A0:%.*]], <2 x i64> [[A1]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %a1 = freeze <2 x i64> poison + %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <8 x i32> + ret <8 x i64> %res +} + + +define <8 x i64> @test_mm512_castsi256_si512_pd256_freeze(<4 x i64> %a0) nounwind #0 { +; CHECK-LABEL: @test_mm512_castsi256_si512_pd256_freeze( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[A1:%.*]] = freeze <4 x i64> poison +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> [[A1]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %a1 = freeze <4 x i64> poison + %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> + ret <8 x i64> %res +} + + +define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) #0 { +; CHECK-LABEL: @bad_mask_transition( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP10]] to i8 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; 
CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[C:%.*]], <8 x double> [[D:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i1> [[TMP16]] to i8 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i16 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP17]] to i16 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[CONV]] to <16 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[CONV2]] to <16 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i1> [[TMP18]], <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x i1> [[TMP19]], <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP22]], <16 x i32> [[TMP4]], <16 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x float> [[F:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x float> [[E:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = or <16 x i32> [[TMP26]], [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = or <16 x i32> [[TMP27]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP28]], <16 x i32> [[TMP23]] +; CHECK-NEXT: [[TMP29:%.*]] = select <16 x i1> [[TMP22]], <16 x float> [[F]], <16 x float> [[E]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP29]] +; +entry: + %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) + %1 = bitcast <8 x i1> %0 to i8 + %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) + %3 = bitcast <8 x i1> %2 to i8 + %conv = zext i8 %1 to i16 + %conv2 = zext i8 %3 to i16 + %4 = bitcast i16 %conv to <16 x i1> + %5 = bitcast i16 %conv2 to <16 x i1> + %6 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %7 = shufflevector <16 x i1> %5, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %8 = shufflevector <8 x i1> %6, <8 x i1> %7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %9 = select <16 x i1> %8, <16 x float> %f, <16 x float> %e + ret <16 x float> %9 +} + +define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) #0 { +; CHECK-LABEL: @bad_mask_transition_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT:
[[TMP8:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP8]] to i8 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP9]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[CONV]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[F:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[E:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP2]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP16]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[F]], <16 x float> [[E]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP17]] +; +entry: + %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) + %1 = bitcast <8 x i1> %0 to i8 + %conv = zext i8 %1 to i16 + %2 = bitcast i16 %conv to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %f, <16 x float> %e + ret <16 x float> %3 +} + +declare <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double>, <8 x double>, <8 x i1>) +declare <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float>, <16 x float>, <16 x i1>) +declare <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64>, <8 x i64>, <8 x i1>) +declare <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32>, <16 x i32>, <16 x i1>) +declare <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double>, <8 x double>, <8 x i1>) +declare <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float>, <16 x float>, <16 x i1>) +declare <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64>, <8 x i64>, <8 x i1>) +declare <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32>, <16 x i32>, <16 x i1>) + +attributes #0 = { sanitize_memory } From 84af3ee5124de3385b829c3a9980fd734f0d92e8 Mon Sep 17 00:00:00 2001 From: Fangrui Song <i@maskray.me> Date: Sun, 26 Jan 2025 16:13:51 -0800 Subject: [PATCH 153/432] [ELF] Replace Fatal with Err --- lld/ELF/Arch/ARM.cpp | 4 ++-- lld/ELF/Arch/RISCV.cpp | 2 +- lld/ELF/InputFiles.cpp | 34 ++++++++++++++++++++++------------ 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index de6e45c6cc65c..7d2953ddf64f0 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -1536,8 +1536,8 @@ template <class ELFT> void elf::writeARMCmseImportLib(Ctx &ctx) { } if (auto e = buffer->commit()) - Fatal(ctx) << "failed to write output '" << buffer->getPath() - << "': " << std::move(e); + Err(ctx) << "failed to write output '" << buffer->getPath() + << "': " << std::move(e); } void elf::setARMTargetInfo(Ctx &ctx) { ctx.target.reset(new ARM(ctx)); } diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 36ae31be6ed2a..4d8989a21b501 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -885,7 +885,7 @@ static bool relax(Ctx &ctx, InputSection &sec) { } // Inform assignAddresses that the size has changed.
if (!isUInt<32>(delta)) - Fatal(ctx) << "section size decrease is too large: " << delta; + Err(ctx) << "section size decrease is too large: " << delta; sec.bytesDropped = delta; return changed; } diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index eba4c234d3f16..42d0e4c202ec6 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1131,8 +1131,8 @@ void ObjFile<ELFT>::initializeSymbols(const object::ELFFile<ELFT> &obj) { sym->isUsedInRegularObj = true; if (LLVM_UNLIKELY(eSym.st_shndx == SHN_COMMON)) { if (value == 0 || value >= UINT32_MAX) - Fatal(ctx) << this << ": common symbol '" << sym->getName() - << "' has invalid alignment: " << value; + Err(ctx) << this << ": common symbol '" << sym->getName() + << "' has invalid alignment: " << value; hasCommonSyms = true; sym->resolve(ctx, CommonSymbol{ctx, this, StringRef(), binding, stOther, type, value, size}); @@ -1384,16 +1384,22 @@ std::vector<uint32_t> SharedFile::parseVerneed(const ELFFile<ELFT> &obj, ArrayRef<uint8_t> data = CHECK2(obj.getSectionContents(*sec), this); const uint8_t *verneedBuf = data.begin(); for (unsigned i = 0; i != sec->sh_info; ++i) { - if (verneedBuf + sizeof(typename ELFT::Verneed) > data.end()) - Fatal(ctx) << this << " has an invalid Verneed"; + if (verneedBuf + sizeof(typename ELFT::Verneed) > data.end()) { + Err(ctx) << this << " has an invalid Verneed"; + break; + } auto *vn = reinterpret_cast<const typename ELFT::Verneed *>(verneedBuf); const uint8_t *vernauxBuf = verneedBuf + vn->vn_aux; for (unsigned j = 0; j != vn->vn_cnt; ++j) { - if (vernauxBuf + sizeof(typename ELFT::Vernaux) > data.end()) - Fatal(ctx) << this << " has an invalid Vernaux"; + if (vernauxBuf + sizeof(typename ELFT::Vernaux) > data.end()) { + Err(ctx) << this << " has an invalid Vernaux"; + break; + } auto *aux = reinterpret_cast<const typename ELFT::Vernaux *>(vernauxBuf); - if (aux->vna_name >= this->stringTable.size()) - Fatal(ctx) << this << " has a Vernaux with an invalid vna_name"; + if (aux->vna_name >= this->stringTable.size()) { + Err(ctx) << this << " has a Vernaux with an invalid vna_name"; + break; + } uint16_t version = aux->vna_other & VERSYM_VERSION; if (version >= verneeds.size()) verneeds.resize(version + 1); @@ -1481,13 +1487,17 @@ template <class ELFT> void SharedFile::parse() { for (const Elf_Dyn &dyn : dynamicTags) { if (dyn.d_tag == DT_NEEDED) { uint64_t val = dyn.getVal(); - if (val >= this->stringTable.size()) - Fatal(ctx) << this << ": invalid DT_NEEDED entry"; + if (val >= this->stringTable.size()) { + Err(ctx) << this << ": invalid DT_NEEDED entry"; + return; + } dtNeeded.push_back(this->stringTable.data() + val); } else if (dyn.d_tag == DT_SONAME) { uint64_t val = dyn.getVal(); - if (val >= this->stringTable.size()) - Fatal(ctx) << this << ": invalid DT_SONAME entry"; + if (val >= this->stringTable.size()) { + Err(ctx) << this << ": invalid DT_SONAME entry"; + return; + } soName = this->stringTable.data() + val; } } From a6044a05cd16d2c5dbca80757a160cba9a2cb037 Mon Sep 17 00:00:00 2001 From: Thurston Dang <thurston@google.com> Date: Sun, 26 Jan 2025 16:59:34 -0800 Subject: [PATCH 154/432] [msan] Fix-forward avx512-intrinsics-upgrade.ll (#124495) I had added the test in https://github.com/llvm/llvm-project/pull/123980 and contemporaneously added AVX masked store/load intrinsics (https://github.com/llvm/llvm-project/pull/123857) and forgot to update the test output for the intersection. This patch fixes the output.
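The CHECK lines in this file are machine-generated, so the fix-forward is to regenerate them rather than hand-edit the expectations. A minimal sketch of that step, assuming a local build tree at build/ (that path is an assumption, not part of this patch; the test file already carries the update-script annotations):

  # build/bin/opt is an assumed local build path; point --opt-binary at your own opt.
  llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
      llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll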
--- .../X86/avx512-intrinsics-upgrade.ll | 184 +++++------------- 1 file changed, 44 insertions(+), 140 deletions(-) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll index edb618fdfb8fb..1ab13a1f1bfeb 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll @@ -16542,23 +16542,15 @@ define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x f ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16576,23 +16568,15 @@ define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; 
CHECK: 7: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16611,24 +16595,16 @@ define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x fl ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] @@ -16647,24 +16623,16 @@ define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] @@ -16680,18 +16648,10 @@ define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP7]] ; %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) @@ -16703,18 +16663,10 @@ 
define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP7]] ; %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) @@ -16729,23 +16681,15 @@ define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x f ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], 
<16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16763,23 +16707,15 @@ define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16798,24 +16734,16 @@ define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x fl ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: 
[[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] @@ -16834,24 +16762,16 @@ define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] @@ -16867,18 +16787,10 @@ define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> 
[[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP7]] ; %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) @@ -16890,18 +16802,10 @@ define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP7]] ; %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) From b9d301cc7e4fe4c442ec15169686fa4a18f5cdfc Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 27 Jan 2025 01:10:35 +0000 Subject: [PATCH 155/432] Revert "[msan] Add handlers for AVX masked load/store intrinsics (#123857)" This reverts commit db79fb2a91df31a07f312f8e061936927ac5c506. 
Reason: buildbot breakage (https://lab.llvm.org/buildbot/#/builders/144/builds/16636/steps/6/logs/FAIL__LLVM__avx512-intrinsics-upgrade_ll)
---
 .../Instrumentation/MemorySanitizer.cpp       | 154 +----------------
 .../MemorySanitizer/X86/avx-intrinsics-x86.ll | 160 ++++++++----------
 .../X86/avx2-intrinsics-x86.ll                | 152 ++++++++---------
 .../i386/avx-intrinsics-i386.ll               | 160 ++++++++----------
 .../i386/avx2-intrinsics-i386.ll              | 152 ++++++++---------
 5 files changed, 289 insertions(+), 489 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index b6293af4ab477..56d3eb10d73e9 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3046,8 +3046,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     if (maybeHandleSimpleNomemIntrinsic(I))
       return true;

-    // FIXME: detect and handle SSE maskstore/maskload?
-    // Some cases are now handled in handleAVXMasked{Load,Store}.
+    // FIXME: detect and handle SSE maskstore/maskload
     return false;
   }

@@ -3684,10 +3683,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // TODO: Store origin.
   }

-  // Intrinsic::masked_store
-  //
-  // Note: handleAVXMaskedStore handles AVX/AVX2 variants, though AVX512 masked
-  // stores are lowered to Intrinsic::masked_store.
   void handleMaskedStore(IntrinsicInst &I) {
     IRBuilder<> IRB(&I);
     Value *V = I.getArgOperand(0);
@@ -3718,10 +3713,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
                      std::max(Alignment, kMinOriginAlignment));
   }

-  // Intrinsic::masked_load
-  //
-  // Note: handleAVXMaskedLoad handles AVX/AVX2 variants, though AVX512 masked
-  // loads are lowered to Intrinsic::masked_load.
   void handleMaskedLoad(IntrinsicInst &I) {
     IRBuilder<> IRB(&I);
     Value *Ptr = I.getArgOperand(0);
@@ -3763,125 +3754,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOrigin(&I, Origin);
   }

-  // e.g., void @llvm.x86.avx.maskstore.ps.256(ptr, <8 x i32>, <8 x float>)
-  //                                           dst  mask       src
-  //
-  // AVX512 masked stores are lowered to Intrinsic::masked_load and are handled
-  // by handleMaskedStore.
-  //
-  // This function handles AVX and AVX2 masked stores; these use the MSBs of a
-  // vector of integers, unlike the LLVM masked intrinsics, which require a
-  // vector of booleans. X86InstCombineIntrinsic.cpp::simplifyX86MaskedLoad
-  // mentions that the x86 backend does not know how to efficiently convert
-  // from a vector of booleans back into the AVX mask format; therefore, they
-  // (and we) do not reduce AVX/AVX2 masked intrinsics into LLVM masked
-  // intrinsics.
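// (A minimal, hedged sketch for orientation; it is not part of this patch,
// and the helper name below is hypothetical. With handleAVXMaskedStore and
// handleAVXMaskedLoad removed, these AVX/AVX2 intrinsics fall back to MSan's
// generic strict treatment, which is what the regenerated CHECK lines in the
// tests below encode: every operand shadow is checked up front and the result
// is treated as clean. Using helpers from this visitor, the fallback behaves
// roughly like:
//
//   void handleIntrinsicStrictly(IntrinsicInst &I) {
//     // Each operand's shadow is reduced to an i1 ([[_MSCMP]] in the tests),
//     // the i1s are OR-ed together ([[_MSOR]]), and any set bit branches to
//     // __msan_warning_noreturn().
//     for (Value *Op : I.args())
//       insertShadowCheck(Op, &I);
//     // The result, if any, is then treated as fully initialized, hence the
//     // "store <N x iK> zeroinitializer, ptr @__msan_retval_tls" lines.
//     if (!I.getType()->isVoidTy())
//       setShadow(&I, getCleanShadow(&I));
//   }
//
// The reverted handlers instead propagated shadow through the intrinsic
// itself, trading that strictness for per-lane precision on masked lanes.)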
-  void handleAVXMaskedStore(IntrinsicInst &I) {
-    IRBuilder<> IRB(&I);
-
-    Value *Dst = I.getArgOperand(0);
-    assert(Dst->getType()->isPointerTy() && "Destination is not a pointer!");
-
-    Value *Mask = I.getArgOperand(1);
-    assert(isa<VectorType>(Mask->getType()) && "Mask is not a vector!");
-
-    Value *Src = I.getArgOperand(2);
-    assert(isa<VectorType>(Src->getType()) && "Source is not a vector!");
-
-    const Align Alignment = Align(1);
-
-    Value *SrcShadow = getShadow(Src);
-
-    if (ClCheckAccessAddress) {
-      insertShadowCheck(Dst, &I);
-      insertShadowCheck(Mask, &I);
-    }
-
-    Value *DstShadowPtr;
-    Value *DstOriginPtr;
-    std::tie(DstShadowPtr, DstOriginPtr) = getShadowOriginPtr(
-        Dst, IRB, SrcShadow->getType(), Alignment, /*isStore*/ true);
-
-    SmallVector<Value *> ShadowArgs;
-    ShadowArgs.append(1, DstShadowPtr);
-    ShadowArgs.append(1, Mask);
-    // The intrinsic may require floating-point but shadows can be arbitrary
-    // bit patterns, of which some would be interpreted as "invalid"
-    // floating-point values (NaN etc.); we assume the intrinsic will happily
-    // copy them.
-    ShadowArgs.append(1, IRB.CreateBitCast(SrcShadow, Src->getType()));
-
-    CallInst *CI =
-        IRB.CreateIntrinsic(IRB.getVoidTy(), I.getIntrinsicID(), ShadowArgs);
-    setShadow(&I, CI);
-
-    if (!MS.TrackOrigins)
-      return;
-
-    // Approximation only
-    auto &DL = F.getDataLayout();
-    paintOrigin(IRB, getOrigin(Src), DstOriginPtr,
-                DL.getTypeStoreSize(SrcShadow->getType()),
-                std::max(Alignment, kMinOriginAlignment));
-  }
-
-  // e.g., <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>)
-  //       return                                    src  mask
-  //
-  // Masked-off values are replaced with 0, which conveniently also represents
-  // initialized memory.
-  //
-  // AVX512 masked stores are lowered to Intrinsic::masked_load and are handled
-  // by handleMaskedStore.
-  //
-  // We do not combine this with handleMaskedLoad; see comment in
-  // handleAVXMaskedStore for the rationale.
-  //
-  // This is subtly different than handleIntrinsicByApplyingToShadow(I, 1)
-  // because we need to apply getShadowOriginPtr, not getShadow, to the first
-  // parameter.
-  void handleAVXMaskedLoad(IntrinsicInst &I) {
-    IRBuilder<> IRB(&I);
-
-    Value *Src = I.getArgOperand(0);
-    assert(Src->getType()->isPointerTy() && "Source is not a pointer!");
-
-    Value *Mask = I.getArgOperand(1);
-    assert(isa<VectorType>(Mask->getType()) && "Mask is not a vector!");
-
-    const Align Alignment = Align(1);
-
-    if (ClCheckAccessAddress) {
-      insertShadowCheck(Mask, &I);
-    }
-
-    Type *SrcShadowTy = getShadowTy(Src);
-    Value *SrcShadowPtr, *SrcOriginPtr;
-    std::tie(SrcShadowPtr, SrcOriginPtr) =
-        getShadowOriginPtr(Src, IRB, SrcShadowTy, Alignment, /*isStore*/ false);
-
-    SmallVector<Value *> ShadowArgs;
-    ShadowArgs.append(1, SrcShadowPtr);
-    ShadowArgs.append(1, Mask);
-
-    CallInst *CI =
-        IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), ShadowArgs);
-    // The intrinsic may require floating-point but shadows can be arbitrary
-    // bit patterns, of which some would be interpreted as "invalid"
-    // floating-point values (NaN etc.); we assume the intrinsic will happily
-    // copy them.
-    setShadow(&I, IRB.CreateBitCast(CI, getShadowTy(&I)));
-
-    if (!MS.TrackOrigins)
-      return;
-
-    // The "pass-through" value is always zero (initialized). To the extent
-    // that that results in initialized aligned 4-byte chunks, the origin value
-    // is ignored. It is therefore correct to simply copy the origin from src.
-    Value *PtrSrcOrigin = IRB.CreateLoad(MS.OriginTy, SrcOriginPtr);
-    setOrigin(&I, PtrSrcOrigin);
-  }
-
   // Instrument BMI / BMI2 intrinsics.
   // All of these intrinsics are Z = I(X, Y)
   // where the types of all operands and the result match, and are either i32 or
@@ -4594,30 +4466,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       break;
     }

-    case Intrinsic::x86_avx_maskstore_ps:
-    case Intrinsic::x86_avx_maskstore_pd:
-    case Intrinsic::x86_avx_maskstore_ps_256:
-    case Intrinsic::x86_avx_maskstore_pd_256:
-    case Intrinsic::x86_avx2_maskstore_d:
-    case Intrinsic::x86_avx2_maskstore_q:
-    case Intrinsic::x86_avx2_maskstore_d_256:
-    case Intrinsic::x86_avx2_maskstore_q_256: {
-      handleAVXMaskedStore(I);
-      break;
-    }
-
-    case Intrinsic::x86_avx_maskload_ps:
-    case Intrinsic::x86_avx_maskload_pd:
-    case Intrinsic::x86_avx_maskload_ps_256:
-    case Intrinsic::x86_avx_maskload_pd_256:
-    case Intrinsic::x86_avx2_maskload_d:
-    case Intrinsic::x86_avx2_maskload_q:
-    case Intrinsic::x86_avx2_maskload_d_256:
-    case Intrinsic::x86_avx2_maskload_q_256: {
-      handleAVXMaskedLoad(I);
-      break;
-    }
-
     case Intrinsic::fshl:
     case Intrinsic::fshr:
       handleFunnelShift(I);
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
index 43f51a810d0d2..7273e431a9c2a 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
@@ -532,22 +532,20 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readonly
 define <2 x double> @test_x86_avx_maskload_pd(ptr %a0, <2 x i64> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskload_pd(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP10]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[TMP4]], <4 x i64> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x double> [[TMP5]] to <4 x i64> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0]], <4 x i64> [[MASK]]) -; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1] @@ -584,22 +580,20 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind read define <4 x float> @test_x86_avx_maskload_ps(ptr %a0, <4 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP10]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[TMP4]], <4 x i32> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0]], <4 x i32> [[MASK]]) -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1] @@ -610,22 +604,20 @@ declare <4 
x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readonly define <8 x float> @test_x86_avx_maskload_ps_256(ptr %a0, <8 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP10]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[TMP4]], <8 x i32> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x float> [[TMP5]] to <8 x i32> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0]], <8 x i32> [[MASK]]) -; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1] @@ -636,25 +628,23 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind reado define void @test_x86_avx_maskstore_pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd( -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[TMP6]], <2 x i64> [[MASK:%.*]], <2 x double> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], 
label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0]], <2 x i64> [[MASK]], <2 x double> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]], <2 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) @@ -665,25 +655,23 @@ declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, <2 x double>) nounwind define void @test_x86_avx_maskstore_pd_256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[TMP6]], <4 x i64> [[MASK:%.*]], <4 x double> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0]], <4 x i64> [[MASK]], <4 x double> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]], <4 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd.256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) @@ -694,25 +682,23 @@ declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwi define void @test_x86_avx_maskstore_ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to 
ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[TMP6]], <4 x i32> [[MASK:%.*]], <4 x float> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0]], <4 x i32> [[MASK]], <4 x float> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]], <4 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) @@ -723,25 +709,23 @@ declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind define void @test_x86_avx_maskstore_ps_256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[TMP6]], <8 x i32> [[MASK:%.*]], <8 x float> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 
[[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0]], <8 x i32> [[MASK]], <8 x float> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]], <8 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps.256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll index c68461dd367ee..e10062142c046 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll @@ -995,21 +995,20 @@ declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind reado define <2 x i64> @test_x86_avx2_maskload_q(ptr %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[TMP4]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[RES]] ; %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] @@ -1020,21 +1019,20 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly define <4 x i64> @test_x86_avx2_maskload_q_256(ptr %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[TMP4]], <4 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> 
[[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0]], <4 x i64> [[A1]]) -; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i64> [[RES]] ; %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] @@ -1045,21 +1043,20 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonl define <4 x i32> @test_x86_avx2_maskload_d(ptr %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[TMP4]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %res = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] @@ -1070,21 +1067,20 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly define <8 x i32> @test_x86_avx2_maskload_d_256(ptr %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[TMP4]], <8 x i32> [[A1:%.*]]) +; CHECK-NEXT: 
[[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0]], <8 x i32> [[A1]]) -; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; %res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] @@ -1095,24 +1091,23 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonl define void @test_x86_avx2_maskstore_q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q( -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[TMP6]], <2 x i64> [[A1:%.*]], <2 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0]], <2 x i64> [[A1]], <2 x i64> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) @@ -1123,24 +1118,23 @@ declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind define void @test_x86_avx2_maskstore_q_256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x 
i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[TMP6]], <4 x i64> [[A1:%.*]], <4 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0]], <4 x i64> [[A1]], <4 x i64> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) @@ -1151,24 +1145,23 @@ declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind define void @test_x86_avx2_maskstore_d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[TMP6]], <4 x i32> [[A1:%.*]], <4 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label 
[[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0]], <4 x i32> [[A1]], <4 x i32> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) @@ -1179,24 +1172,23 @@ declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind define void @test_x86_avx2_maskstore_d_256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[TMP6]], <8 x i32> [[A1:%.*]], <8 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0]], <8 x i32> [[A1]], <8 x i32> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll index a22ca6dd15da4..68337d6d962db 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll @@ -550,23 +550,21 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readonly define <2 x double> @test_x86_avx_maskload_pd(ptr %a0, <2 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_pd( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: 
[[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[TMP11]], <2 x i64> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <2 x i64> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[A0]], <2 x i64> [[MASK]]) -; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.x86.avx.maskload.pd(ptr %a0, <2 x i64> %mask) ; <<2 x double>> [#uses=1] @@ -577,23 +575,21 @@ declare <2 x double> @llvm.x86.avx.maskload.pd(ptr, <2 x i64>) nounwind readonly define <4 x double> @test_x86_avx_maskload_pd_256(ptr %a0, <4 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_pd_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[TMP11]], <4 x i64> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x double> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0]], <4 x i64> [[MASK]]) -; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; 
CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1] @@ -604,23 +600,21 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind read define <4 x float> @test_x86_avx_maskload_ps(ptr %a0, <4 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[TMP11]], <4 x i32> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <4 x i32> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0]], <4 x i32> [[MASK]]) -; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1] @@ -631,23 +625,21 @@ declare <4 x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readonly define <8 x float> @test_x86_avx_maskload_ps_256(ptr %a0, <8 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[TMP11]], <8 x i32> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x float> [[TMP6]] to <8 x i32> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; 
CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0]], <8 x i32> [[MASK]]) -; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1] @@ -658,26 +650,24 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind reado define void @test_x86_avx_maskstore_pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd( -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[TMP7]], <2 x i64> [[MASK:%.*]], <2 x double> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0]], <2 x i64> [[MASK]], <2 x double> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]], <2 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) @@ -688,26 +678,24 @@ declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, <2 x double>) nounwind define void @test_x86_avx_maskstore_pd_256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to 
ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[TMP7]], <4 x i64> [[MASK:%.*]], <4 x double> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0]], <4 x i64> [[MASK]], <4 x double> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]], <4 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd.256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) @@ -718,26 +706,24 @@ declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwi define void @test_x86_avx_maskstore_ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[TMP7]], <4 x i32> [[MASK:%.*]], <4 x float> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; 
CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0]], <4 x i32> [[MASK]], <4 x float> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]], <4 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) @@ -748,26 +734,24 @@ declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind define void @test_x86_avx_maskstore_ps_256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[TMP7]], <8 x i32> [[MASK:%.*]], <8 x float> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0]], <8 x i32> [[MASK]], <8 x float> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]], <8 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps.256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll index 442f0c422645a..29e2931d2ca48 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll @@ -1048,22 +1048,21 @@ declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x 
i32>) nounwind reado define <2 x i64> @test_x86_avx2_maskload_q(ptr %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[TMP10]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[RES]] ; %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] @@ -1074,22 +1073,21 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly define <4 x i64> @test_x86_avx2_maskload_q_256(ptr %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[TMP10]], <4 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0]], <4 x i64> [[A1]]) -; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, 
ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i64> [[RES]] ; %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] @@ -1100,22 +1098,21 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonl define <4 x i32> @test_x86_avx2_maskload_d(ptr %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[TMP10]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %res = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] @@ -1126,22 +1123,21 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly define <8 x i32> @test_x86_avx2_maskload_d_256(ptr %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[TMP10]], <8 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0]], <8 x i32> 
[[A1]]) -; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; %res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] @@ -1152,25 +1148,24 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonl define void @test_x86_avx2_maskstore_q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q( -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[TMP7]], <2 x i64> [[A1:%.*]], <2 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0]], <2 x i64> [[A1]], <2 x i64> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) @@ -1181,25 +1176,24 @@ declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind define void @test_x86_avx2_maskstore_q_256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; 
CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[TMP7]], <4 x i64> [[A1:%.*]], <4 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0]], <4 x i64> [[A1]], <4 x i64> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) @@ -1210,25 +1204,24 @@ declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind define void @test_x86_avx2_maskstore_d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[TMP7]], <4 x i32> [[A1:%.*]], <4 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void 
@llvm.x86.avx2.maskstore.d(ptr [[A0]], <4 x i32> [[A1]], <4 x i32> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) @@ -1239,25 +1232,24 @@ declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind define void @test_x86_avx2_maskstore_d_256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[TMP7]], <8 x i32> [[A1:%.*]], <8 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0]], <8 x i32> [[A1]], <8 x i32> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) From b6eeec586fa6c0db4ab1b0e129111e82a97c7283 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 27 Jan 2025 01:10:51 +0000 Subject: [PATCH 156/432] Revert "[msan] Fix-forward avx512-intrinsics-upgrade.ll (#124495)" This reverts commit a6044a05cd16d2c5dbca80757a160cba9a2cb037. 
Reason: buildbot breakage (https://lab.llvm.org/buildbot/#/builders/144/builds/16636/steps/6/logs/FAIL__LLVM__avx512-intrinsics-upgrade_ll) --- .../X86/avx512-intrinsics-upgrade.ll | 184 +++++++++++++----- 1 file changed, 140 insertions(+), 44 deletions(-) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll index 1ab13a1f1bfeb..edb618fdfb8fb 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll @@ -16542,15 +16542,23 @@ define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x f ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16568,15 +16576,23 @@ define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = 
icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16595,16 +16611,24 @@ define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x fl ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] @@ -16623,16 
+16647,24 @@ define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] @@ -16648,10 +16680,18 @@ define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP7]] ; %res = call <16 x float> 
@llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) @@ -16663,10 +16703,18 @@ define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP7]] ; %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) @@ -16681,15 +16729,23 @@ define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x f ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: 
[[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16707,15 +16763,23 @@ define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16734,16 +16798,24 @@ define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x fl ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) 
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
 ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
 ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
 ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
 ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
@@ -16762,16 +16834,24 @@ define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16
 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
 ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
 ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
 ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
 ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
 ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
@@ -16787,10 +16867,18 @@ define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float>
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
 ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x float> [[TMP7]]
 ;
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
@@ -16802,10 +16890,18 @@ define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x flo
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
 ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x float> [[TMP7]]
 ;
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)

From b2647ffbf797dd5a457b6b19faab06956934d067 Mon Sep 17 00:00:00 2001
From: Thurston Dang
Date: Mon, 27 Jan 2025 01:35:32 +0000
Subject: [PATCH 157/432] Revert "[msan] Add avx512-intrinsics.ll and
 avx512-intrinsics-upgrade.ll test case (#123980)"

This reverts commit 980e86f130eea02bd41b887f4ed896340fc90f6c.

Reason: buildbot breakage (https://lab.llvm.org/buildbot/#/builders/154/builds/10901/steps/5/logs/FAIL__LLVM__avx512-intrinsics-upgrade_ll)
---
 .../X86/avx512-intrinsics-upgrade.ll          | 19969 ----------------
 .../MemorySanitizer/X86/avx512-intrinsics.ll  | 13714 -----------
 2 files changed, 33683 deletions(-)
 delete mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
 delete mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll

diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
deleted file mode 100644
index edb618fdfb8fb..0000000000000
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
+++ /dev/null
@@ -1,19969 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt %s -S -mtriple=x86_64-linux-gnu -mattr=+avx512f -passes=msan 2>&1 | FileCheck %s
-;
-; Forked from llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
-
-declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
-
-define i16 @unpckbw_test(i16 %a0, i16 %a1) #0 {
-;
-; CHECK-LABEL: @unpckbw_test(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1>
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i1> [[TMP3]], <16 x i1> [[TMP3]], <8 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i1> [[TMP4]], <16 x i1> [[TMP4]], <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i1> [[TMP5]], <16 x i1> [[TMP5]], <8 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i1> [[TMP6]], <16 x i1> [[TMP6]], <8 x i32>
-; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[_MSPROP1]], <8 x i1> [[_MSPROP]], <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i1> [[TMP8]], <8 x i1> [[TMP7]], <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP2]] to i16
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
-; CHECK-NEXT: store i16 [[TMP10]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP11]]
-;
- %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
- ret i16 %res
-}
-
-define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastd_gpr_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT:
[[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <16 x i32> poison, i32 [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = shufflevector <16 x i32> [[_MSPROP6]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT1]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP7]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DOTSPLAT2]], [[X1:%.*]] -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP7]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[DOTSPLAT2]], <16 x i32> [[X1]] -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <16 x i32> poison, i32 [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = shufflevector <16 x i32> [[_MSPROP8]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT3]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP9]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[DOTSPLAT4]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP9]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT10:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[DOTSPLAT4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP5]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[DOTSPLAT]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP18]], <16 x i32> [[_MSPROP_SELECT]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP10]], 1 -; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP19]], <16 x i32> [[_MSPROP_SELECT10]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP17]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP20]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask) - %res2 = call <16 x i32> 
@llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} -declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16) - - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastq_gpr_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = shufflevector <8 x i64> [[_MSPROP6]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP7]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[DOTSPLAT2]], [[X1:%.*]] -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP7]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[DOTSPLAT2]], <8 x i64> [[X1]] -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = shufflevector <8 x i64> [[_MSPROP8]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP9]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[DOTSPLAT4]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP9]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT10:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> 
[[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[DOTSPLAT4]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP5]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[DOTSPLAT]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP18]], <8 x i64> [[_MSPROP_SELECT]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP10]], 1 -; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP19]], <8 x i64> [[_MSPROP_SELECT10]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP17]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP20]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} -declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8) - - -declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly - -define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_x86_vbroadcast_ss_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> undef, i16 -1) - ret <16 x float> %res -} - -define <16 x float> @test_x86_mask_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) #0 { -; -; CHECK-LABEL: @test_x86_mask_vbroadcast_ss_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> 
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[A1:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[A1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP13]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask) - ret <16 x float> %res -} - -define <16 x float> @test_x86_maskz_vbroadcast_ss_ps_512(<4 x float> %a0, i16 %mask ) #0 { -; -; CHECK-LABEL: @test_x86_maskz_vbroadcast_ss_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask) - ret <16 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly - -define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1) #0 { -; CHECK-LABEL: @test_x86_vbroadcast_sd_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> undef, i8 -1) - ret <8 x double> %res -} - -define <8 x double> @test_x86_mask_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x 
double> %a1, i8 %mask ) #0 { -; -; CHECK-LABEL: @test_x86_mask_vbroadcast_sd_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[A1:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[A1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask) - ret <8 x double> %res -} - -define <8 x double> @test_x86_maskz_vbroadcast_sd_pd_512(<2 x double> %a0, i8 %mask ) #0 { -; -; CHECK-LABEL: @test_x86_maskz_vbroadcast_sd_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask) - ret <8 x double> %res -} - -declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> 
%x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pbroadcastd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_maskz_pbroadcastd_512(<4 x i32> %x0, i16 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pbroadcastd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] -; 
CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP3]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP10]] -; - %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pbroadcastq_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastq_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_pbroadcastq_512(<2 x i64> %x0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pbroadcastq_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = 
bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP3]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP10]] -; - %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask) - ret <8 x i64> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_movsldup_512(<16 x float> %x0, <16 x float> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_movsldup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_movsldup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP13]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_maskz_movsldup_512(<16 x float> %x0, i16 %x2) #0 { -; -; 
CHECK-LABEL: @test_int_x86_avx512_maskz_movsldup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2) - ret <16 x float> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_movshdup_512(<16 x float> %x0, <16 x float> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_movshdup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_movshdup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] -; 
CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP13]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_maskz_movshdup_512(<16 x float> %x0, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_movshdup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2) - ret <16 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_movddup_512(<8 x double> %x0, <8 x double> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_movddup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_movddup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; 
CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X1:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_maskz_movddup_512(<8 x double> %x0, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_movddup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2) - ret <8 x double> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_perm_df_512(<8 x double> %x0, <8 x double> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_perm_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = 
call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_perm_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_maskz_perm_df_512(<8 x double> %x0, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_perm_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3) - ret <8 x double> %res -} - -declare <8 x i64> 
@llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_perm_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_perm_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X2:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_perm_di_512(<8 x i64> %x0, i32 %x1, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_perm_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = 
select <8 x i1> [[TMP5]], <8 x i64> [[TMP3]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP10]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) - ret <8 x i64> %res -} - -define void @test_store1(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_store1( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 1, <16 x i1> [[TMP6]]) -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1:![0-9]+]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8:[0-9]+]] -; CHECK-NEXT: unreachable -; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr [[PTR]], i32 1, <16 x i1> [[TMP6]]) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr -; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 1 -; CHECK-NEXT: store <16 x float> [[DATA]], ptr [[PTR2]], align 1 -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.storeu.ps.512(ptr %ptr, <16 x float> %data, i16 %mask) - call void @llvm.x86.avx512.mask.storeu.ps.512(ptr %ptr2, <16 x float> %data, i16 -1) - ret void -} - -declare void @llvm.x86.avx512.mask.storeu.ps.512(ptr, <16 x float>, i16 ) - -define void @test_store2(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 { -; -; CHECK-LABEL: @test_store2( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 
[[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 1, <8 x i1> [[TMP6]]) -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr [[PTR]], i32 1, <8 x i1> [[TMP6]]) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr -; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 1 -; CHECK-NEXT: store <8 x double> [[DATA]], ptr [[PTR2]], align 1 -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.storeu.pd.512(ptr %ptr, <8 x double> %data, i8 %mask) - call void @llvm.x86.avx512.mask.storeu.pd.512(ptr %ptr2, <8 x double> %data, i8 -1) - ret void -} - -declare void @llvm.x86.avx512.mask.storeu.pd.512(ptr, <8 x double>, i8) - -define void @test_mask_store_aligned_ps(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_store_aligned_ps( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 64, <16 x i1> [[TMP6]]) -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr [[PTR]], i32 64, <16 x i1> [[TMP6]]) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 
-; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 64
-; CHECK-NEXT: store <16 x float> [[DATA]], ptr [[PTR2]], align 64
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.store.ps.512(ptr %ptr, <16 x float> %data, i16 %mask)
- call void @llvm.x86.avx512.mask.store.ps.512(ptr %ptr2, <16 x float> %data, i16 -1)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.store.ps.512(ptr, <16 x float>, i16 )
-
-define void @test_mask_store_aligned_pd(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_store_aligned_pd(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 64, <8 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr [[PTR]], i32 64, <8 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 64
-; CHECK-NEXT: store <8 x double> [[DATA]], ptr [[PTR2]], align 64
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.store.pd.512(ptr %ptr, <8 x double> %data, i8 %mask)
- call void @llvm.x86.avx512.mask.store.pd.512(ptr %ptr2, <8 x double> %data, i8 -1)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.store.pd.512(ptr, <8 x double>, i8)
-
-define void@test_int_x86_avx512_mask_storeu_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_storeu_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 1, <8 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr [[PTR1]], i32 1, <8 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 1
-; CHECK-NEXT: store <8 x i64> [[X1]], ptr [[PTR2]], align 1
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.storeu.q.512(ptr %ptr1, <8 x i64> %x1, i8 %x2)
- call void @llvm.x86.avx512.mask.storeu.q.512(ptr %ptr2, <8 x i64> %x1, i8 -1)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.storeu.q.512(ptr, <8 x i64>, i8)
-
-define void@test_int_x86_avx512_mask_storeu_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_storeu_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 1, <16 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr [[PTR1]], i32 1, <16 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 1
-; CHECK-NEXT: store <16 x i32> [[X1]], ptr [[PTR2]], align 1
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.storeu.d.512(ptr %ptr1, <16 x i32> %x1, i16 %x2)
- call void @llvm.x86.avx512.mask.storeu.d.512(ptr %ptr2, <16 x i32> %x1, i16 -1)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.storeu.d.512(ptr, <16 x i32>, i16)
-
-define void@test_int_x86_avx512_mask_store_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_store_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 64, <8 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr [[PTR1]], i32 64, <8 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 64
-; CHECK-NEXT: store <8 x i64> [[X1]], ptr [[PTR2]], align 64
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.store.q.512(ptr %ptr1, <8 x i64> %x1, i8 %x2)
- call void @llvm.x86.avx512.mask.store.q.512(ptr %ptr2, <8 x i64> %x1, i8 -1)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.store.q.512(ptr, <8 x i64>, i8)
-
-define void@test_int_x86_avx512_mask_store_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_store_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 64, <16 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr [[PTR1]], i32 64, <16 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 64
-; CHECK-NEXT: store <16 x i32> [[X1]], ptr [[PTR2]], align 64
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.store.d.512(ptr %ptr1, <16 x i32> %x1, i16 %x2)
- call void @llvm.x86.avx512.mask.store.d.512(ptr %ptr2, <16 x i32> %x1, i16 -1)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.store.d.512(ptr, <16 x i32>, i16)
-
-define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, ptr %ptr, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_aligned_ps(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x float>, ptr [[PTR:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP10]], <16 x float> [[TMP5]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 24:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 25:
-; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP19]], <16 x float> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP26]], [[TMP17]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES4]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 -1)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr %ptr, <16 x float> %res, i16 %mask)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 %mask)
- %res4 = fadd <16 x float> %res2, %res1
- ret <16 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr, <16 x float>, i16)
-
-define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, ptr %ptr, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_unaligned_ps(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x float>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 1, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP10]], <16 x float> [[TMP5]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 1, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 24:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 25:
-; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP19]], <16 x float> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP26]], [[TMP17]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES4]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 -1)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr %ptr, <16 x float> %res, i16 %mask)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 %mask)
- %res4 = fadd <16 x float> %res2, %res1
- ret <16 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr, <16 x float>, i16)
-
-define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, ptr %ptr, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_aligned_pd(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x double>, ptr [[PTR:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP10]], <8 x double> [[TMP5]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 24:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 25:
-; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP19]], <8 x double> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[TMP26]], [[TMP17]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES4]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 -1)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr %ptr, <8 x double> %res, i8 %mask)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x double> %res2, %res1
- ret <8 x double> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr, <8 x double>, i8)
-
-define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, ptr %ptr, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_unaligned_pd(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x double>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 1, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP10]], <8 x double> [[TMP5]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 1, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 24:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 25:
-; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP19]], <8 x double> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[TMP26]], [[TMP17]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES4]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 -1)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr %ptr, <8 x double> %res, i8 %mask)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x double> %res2, %res1
- ret <8 x double> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr, <8 x double>, i8)
-
-declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr, <16 x i32>, i16)
-
-define <16 x i32> @test_mask_load_unaligned_d(ptr %ptr, ptr %ptr2, <16 x i32> %data, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_unaligned_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080
-; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP14]], i32 1, <16 x i1> [[TMP11]], <16 x i32> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[TMP10]] to i16
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
-; CHECK: 16:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 17:
-; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR2]], i32 1, <16 x i1> [[TMP11]], <16 x i32> [[TMP6]])
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
-; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP23]], i32 1, <16 x i1> [[TMP20]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP19]] to i16
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP24]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP25:%.*]], label [[TMP26:%.*]], !prof [[PROF1]]
-; CHECK: 25:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 26:
-; CHECK-NEXT: [[TMP27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP20]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i32> [[TMP27]], [[TMP18]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[RES4]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 -1)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr %ptr2, <16 x i32> %res, i16 %mask)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 %mask)
- %res4 = add <16 x i32> %res2, %res1
- ret <16 x i32> %res4
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr, <8 x i64>, i8)
-
-define <8 x i64> @test_mask_load_unaligned_q(ptr %ptr, ptr %ptr2, <8 x i64> %data, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_unaligned_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP9]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080
-; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP14]], i32 1, <8 x i1> [[TMP11]], <8 x i64> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i1> [[TMP10]] to i8
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
-; CHECK: 16:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 17:
-; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR2]], i32 1, <8 x i1> [[TMP11]], <8 x i64> [[TMP6]])
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
-; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP23]], i32 1, <8 x i1> [[TMP20]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP19]] to i8
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP24]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP25:%.*]], label [[TMP26:%.*]], !prof [[PROF1]]
-; CHECK: 25:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 26:
-; CHECK-NEXT: [[TMP27:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP20]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = add <8 x i64> [[TMP27]], [[TMP18]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES4]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 -1)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr %ptr2, <8 x i64> %res, i8 %mask)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 %mask)
- %res4 = add <8 x i64> %res2, %res1
- ret <8 x i64> %res4
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr, <16 x i32>, i16)
-
-define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, ptr %ptr, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_aligned_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr [[PTR:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[TMP5]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 24:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 25:
-; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i32> [[TMP26]], [[TMP17]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[RES4]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 -1)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr %ptr, <16 x i32> %res, i16 %mask)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 %mask)
- %res4 = add <16 x i32> %res2, %res1
- ret <16 x i32> %res4
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr, <8 x i64>, i8)
-
-define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, ptr %ptr, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_aligned_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[TMP5]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 24:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 25:
-; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = add <8 x i64> [[TMP26]], [[TMP17]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES4]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 -1)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr %ptr, <8 x i64> %res, i8 %mask)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 %mask)
- %res4 = add <8 x i64> %res2, %res1
- ret <8 x i64> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermil_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32>
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP2]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermil_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP13]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_maskz_vpermil_pd_512(<8 x double> %x0, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermil_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP11]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
- ret <8 x double> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermil_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP2]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
- ret <16 x float> %res
-}
-
-define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermil_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP13]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
- ret <16 x float> %res
-}
-
-define <16 x float>@test_int_x86_avx512_maskz_vpermil_ps_512(<16 x float> %x0, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermil_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP11]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
- ret <16 x float> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pshuf_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP2]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_maskz_pshuf_d_512(<16 x i32> %x0, i32 %x1, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pshuf_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP3]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP10]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
- ret <16 x i32> %res
-}
-
-define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_pcmpeq_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[TMP4]], splat (i32 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
-; CHECK-NEXT: store i16 [[TMP10]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP11]]
-;
- %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
- ret i16 %res
-}
-
-define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_pcmpeq_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP5]], splat (i32 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP7]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i1> [[TMP17]] to i16
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
-; CHECK-NEXT: store i16 [[TMP19]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP20]]
-;
- %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
- ret i16 %res
-}
-
-declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
-
-define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_pcmpeq_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[TMP4]], splat (i64 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
-; CHECK-NEXT: store i8 [[TMP10]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i8 [[TMP11]]
-;
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_pcmpeq_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP5]], splat (i64 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP7]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i1> [[TMP17]] to i8
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
-; CHECK-NEXT: store i8 [[TMP19]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i8 [[TMP20]]
-;
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
-
-define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_pcmpgt_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP3]], [[TMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP7]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt <16 x i32> [[TMP5]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[TMP13]] to i16
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i1> [[TMP14]] to i16
-; CHECK-NEXT: store i16 [[TMP15]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP16]]
-;
- %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
- ret i16 %res
-}
-
-define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_pcmpgt_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP4]], [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i32> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP8]], [[TMP2]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP6]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt <16 x i32> [[TMP7]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i1> [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP17]]
-; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i1> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP21]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i1> [[TMP15]], [[TMP17]]
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP22]] to i16
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16
-; CHECK-NEXT: store i16 [[TMP24]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP25]]
-;
- %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
- ret i16 %res
-}
-
-declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
-
-define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_pcmpgt_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP3]], [[TMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt <8 x i64> [[TMP5]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <8 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i1> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i1> [[TMP13]] to i8
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i1> [[TMP14]] to i8
-; CHECK-NEXT: store i8 [[TMP15]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i8 [[TMP16]]
-;
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_pcmpgt_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP4]], [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], [[TMP2]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <8 x i64> [[TMP6]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt <8 x i64> [[TMP7]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i1> [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP14]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i1> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i1> [[TMP14]], [[TMP17]]
-; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i1> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i1> [[TMP21]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i1> [[TMP15]], [[TMP17]]
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP22]] to i8
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i1> [[TMP23]] to i8
-; CHECK-NEXT: store i8
[[TMP24]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[TMP25]] -; - %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) - ret i8 %res -} - -declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8) - -declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_unpckh_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_unpckh_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP14]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) - ret <8 x double> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_unpckh_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load 
<16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_unpckh_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP14]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) - ret <16 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_unpckl_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> 
[[TMP3]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_unpckl_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP14]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) - ret <8 x double> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_unpckl_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_unpckl_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), 
align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP14]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) - ret <16 x float> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_punpcklqd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_punpcklqd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: 
[[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_punpcklqd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_punpckhqd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 
%x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_punpckhqd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_punpckhd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_punpckhd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], 
<16 x i32> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_punpckld_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_punpckld_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr 
@__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_pslli_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP4]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_pslli_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[A1:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[A1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP13]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_pslli_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x 
i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone - -define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_pslli_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP4]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_pslli_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[A1:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[A1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP13]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_pslli_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone - -define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrli_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP4]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psrli_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[A1:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], 
<16 x i32> [[TMP6]], <16 x i32> [[A1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP13]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psrli_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone - -define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrli_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP4]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psrli_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) 
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[A1:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[A1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP13]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psrli_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone - -define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrai_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP4]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psrai_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load 
i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[A1:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[A1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP13]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psrai_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone - -define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrai_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer -; 
CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7)
-; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP4]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psrai_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7)
-; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7)
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[A1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP13]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psrai_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7)
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7)
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
-
-declare void @llvm.x86.avx512.storent.q.512(ptr, <8 x i64>)
-
-define void@test_storent_q_512(<8 x i64> %data, ptr %ptr) #0 {
-;
-; CHECK-LABEL: @test_storent_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP7]], align 64
-; CHECK-NEXT: store <8 x i64> [[DATA:%.*]], ptr [[PTR]], align 64, !nontemporal [[META2:![0-9]+]]
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.storent.q.512(ptr %ptr, <8 x i64> %data)
- ret void
-}
-
-declare void @llvm.x86.avx512.storent.pd.512(ptr, <8 x double>)
-
-define void @test_storent_pd_512(<8 x double> %data, ptr %ptr) #0 {
-;
-; CHECK-LABEL: @test_storent_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP7]], align 64
-; CHECK-NEXT: store <8 x double> [[DATA:%.*]], ptr [[PTR]], align 64, !nontemporal [[META2]]
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.storent.pd.512(ptr %ptr, <8 x double> %data)
- ret void
-}
-
-declare void @llvm.x86.avx512.storent.ps.512(ptr, <16 x float>)
-
-define void @test_storent_ps_512(<16 x float> %data, ptr %ptr) #0 {
-;
-; CHECK-LABEL: @test_storent_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP7]], align 64
-; CHECK-NEXT: store <16 x float> [[DATA:%.*]], ptr [[PTR]], align 64, !nontemporal [[META2]]
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.storent.ps.512(ptr %ptr, <16 x float> %data)
- ret void
-}
-
-define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_xor_epi32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_xor_epi32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_or_epi32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -1)
-; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -1)
-; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP10]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_or_epi32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -1)
-; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP5]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[TMP1]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP11]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]]
-; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP19]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_and_epi32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[A:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: store <16 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP8]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_and_epi32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[A:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP1]], [[B:%.*]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP9]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP9]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_xor_epi64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_xor_epi64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_or_epi64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -1)
-; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -1)
-; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP1]], [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP10]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_or_epi64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -1)
-; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP5]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP1]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]]
-; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP19]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_and_epi64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[A:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: store <8 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP8]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_and_epi64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[A:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP1]], [[B:%.*]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP9]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP9]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP17]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_mask_add_epi32_rr(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rrk(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP8]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rmk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rmkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rmb(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP8]]
-;
- %q = load i32, ptr %ptr_b
- %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
- %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rmbk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP2]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %q = load i32, ptr %ptr_b
- %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
- %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rmbkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %q = load i32, ptr %ptr_b
- %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
- %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_mask_sub_epi32_rr(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rrk(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP8:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP8]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rmk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rmkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rmb(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP8]]
-;
- %q = load i32, ptr %ptr_b
- %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
- %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rmbk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP2]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %q = load i32, ptr %ptr_b
- %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
- %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rmbkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %q = load i32, ptr %ptr_b
- %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
- %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_mask_add_epi64_rr(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rrk(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP8]]
-;
- %b = load <8 x i64>, ptr %ptr_b
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rmk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP17]]
-;
- %b = load <8 x i64>, ptr %ptr_b
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rmkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP16]]
-;
- %b = load <8 x i64>, ptr %ptr_b
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rmb(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP8]]
-;
- %q = load i64, ptr %ptr_b
- %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
- %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rmbk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT:
[[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP17]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_add_epi64_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i64> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP16]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b = shufflevector <8 x i64> %vecinit.i, 
<8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - -define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) #0 { -; CHECK-LABEL: @test_mask_sub_epi64_rr( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rrk( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rrkz( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; 
CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]] -; CHECK-NEXT: [[TMP8:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP8]] -; - %b = load <8 x i64>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rmk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] 
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP17]] -; - %b = load <8 x i64>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rmkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP16]] -; - %b = load <8 x i64>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rmb( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 
[[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP8:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP8]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = 
xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP17]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP16]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> 
%res -} - -declare <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - -define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) #0 { -; CHECK-LABEL: @test_mask_mullo_epi32_rr_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rrk_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rrkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x 
i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rm_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]] -; CHECK-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP8]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rmk_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] -; 
CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP17]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rmkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP16]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rmb_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 
-; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP8]] -; - %q = load i32, ptr %ptr_b - %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 - %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rmbk_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to 
<16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP2]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP17]] -; - %q = load i32, ptr %ptr_b - %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 - %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rmbkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP16]] -; - %q = load i32, ptr %ptr_b - %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 - %b = 
shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) - ret < 16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) - - -declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_shuf_f32x4( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_f32x4( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X3:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X3]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP14]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4) - ret <16 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 
x double> %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_shuf_f64x2( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_f64x2( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X3:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X3]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP14]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_maskz_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_shuf_f64x2( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP11]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP12]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
- ret <8 x double> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_shuf_i32x4(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_i32x4(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X3:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X3]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_shuf_i64x2(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32>
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_i64x2(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X3:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X3]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
- ret <8 x i64> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_shuf_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32>
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP3]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X3:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X3]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP14]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_maskz_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, i8 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_shuf_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP11]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP12]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
- ret <8 x double> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_shuf_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP3]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
- ret <16 x float> %res
-}
-
-define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X3:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X3]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP14]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
- ret <16 x float> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmaxs_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmaxs_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmaxu_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmaxu_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmins_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmins_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pminu_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pminu_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) #0 {
-;
-; CHECK-LABEL: @test_mm_mask_move_ss(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP0]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[__U:%.*]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP0]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[__U]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[TMP9]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = or i8 [[TMP8]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP11]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[TMP11]], -1
-; CHECK-NEXT: [[TMP14:%.*]] = and i8 [[TMP13]], [[TMP10]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP12]], [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[__B:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[__W:%.*]], i64 0
-; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP]], i32 [[_MSPROP1]]
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast float [[TMP17]] to i32
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP18]] to i32
-; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i32 [[TMP24]], i32 [[TMP19]]
-; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP16]], float [[TMP17]], float [[TMP18]]
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[__A:%.*]], float [[TMP25]], i64 0
-; CHECK-NEXT: store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[TMP26]]
-;
-entry:
- %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__W, i8 %__U)
- ret <4 x float> %res
-}
-
-
-define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) #0 {
-;
-; CHECK-LABEL: @test_mm_maskz_move_ss(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP0]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[__U:%.*]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[TMP0]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[__U]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = xor i8 [[TMP8]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = or i8 [[TMP7]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = xor i8 [[TMP10]], -1
-; CHECK-NEXT: [[TMP13:%.*]] = and i8 [[TMP12]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[TMP13]], 0
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP11]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[__B:%.*]], i64 0
-; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], i32 [[_MSPROP]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP16]] to i32
-; CHECK-NEXT: [[TMP19:%.*]] = xor i32 [[TMP18]], 0
-; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], 0
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i32 [[TMP21]], i32 [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP15]], float [[TMP16]], float 0.000000e+00
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[__A:%.*]], float [[TMP22]], i64 0
-; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[TMP23]]
-;
-entry:
- %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> zeroinitializer, i8 %__U)
- ret <4 x float> %res
-}
-
-define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) #0 {
-;
-; CHECK-LABEL: @test_mm_mask_move_sd(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP0]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[__U:%.*]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP0]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[__U]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[TMP9]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = or i8 [[TMP8]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP11]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[TMP11]], -1
-; CHECK-NEXT: [[TMP14:%.*]] = and i8 [[TMP13]], [[TMP10]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP12]], [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[__B:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[__W:%.*]], i64 0
-; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP16]], i64 [[_MSPROP]], i64 [[_MSPROP1]]
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast double [[TMP17]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast double [[TMP18]] to i64
-; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP22]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i64 [[TMP24]], i64 [[TMP19]]
-; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP16]], double [[TMP17]], double [[TMP18]]
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[__A:%.*]], double [[TMP25]], i64 0
-; CHECK-NEXT: store <2 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[TMP26]]
-;
-entry:
- %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__W, i8 %__U)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) #0 {
-;
-; CHECK-LABEL: @test_mm_maskz_move_sd(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP0]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[__U:%.*]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[TMP0]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[__U]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = xor i8 [[TMP8]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = or i8 [[TMP7]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = xor i8 [[TMP10]], -1
-; CHECK-NEXT: [[TMP13:%.*]] = and i8 [[TMP12]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[TMP13]], 0
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP11]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[__B:%.*]], i64 0
-; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], i64 [[_MSPROP]], i64 0
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast double [[TMP16]] to i64
-; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], 0
-; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[TMP20]], 0
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i64 [[TMP21]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP15]], double [[TMP16]], double 0.000000e+00
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> [[__A:%.*]], double [[TMP22]], i64 0
-; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[TMP23]]
-;
-entry:
- %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> zeroinitializer, i8 %__U)
- ret <2 x double> %res
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
-declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
-
-declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmovzxb_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i8> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxb_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i8> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_maskz_pmovzxb_d_512(<16 x i8> %x0, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxb_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i8> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmovzxb_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i8> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxb_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i8> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_pmovzxb_q_512(<16 x i8> %x0, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxb_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i8> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmovzxd_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i32> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxd_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i32> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_pmovzxd_q_512(<8 x i32> %x0, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxd_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i32> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmovzxw_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i16> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i16> [[TMP2]] to <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxw_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i16> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i16> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_maskz_pmovzxw_d_512(<16 x i16> %x0, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxw_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i16> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i16> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmovzxw_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i16> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxw_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i16> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_pmovzxw_q_512(<8 x i16> %x0, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxw_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i16> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmovsxb_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i8> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxb_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i8> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]],
ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_maskz_pmovsxb_d_512(<16 x i8> %x0, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxb_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i8> [[_MSPROP]] to <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmovsxb_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i8> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i64> -; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxb_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32> -; 
CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i8> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_pmovsxb_q_512(<16 x i8> %x0, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxb_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i8> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmovsxd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i32> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1) - ret <8 x 
i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i32> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_pmovsxd_q_512(<8 x i32> %x0, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i32> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i32> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) - ret <8 x i64> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i16) - -define <16 x 
i32>@test_int_x86_avx512_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmovsxw_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i16> [[_MSPROP]] to <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i16> [[TMP2]] to <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxw_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i16> [[_MSPROP]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i16> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_maskz_pmovsxw_d_512(<16 x i16> %x0, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxw_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i16> [[_MSPROP]] to <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i16> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] 
= select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmovsxw_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i16> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i64> -; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxw_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i16> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_pmovsxw_q_512(<8 x i16> %x0, i8 %x2) #0 { -; -; 
CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxw_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i16> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) - ret <8 x i64> %res -} - -declare <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32>, <16 x i32>) - -define <16 x i32>@test_int_x86_avx512_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_prolv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP7]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1) - ret <16 x i32> %1 -} - -define <16 x i32>@test_int_x86_avx512_mask_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] 
to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP16]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 - ret <16 x i32> %3 -} - -define <16 x i32>@test_int_x86_avx512_maskz_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP15]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer - ret <16 x i32> %3 -} - -declare <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64>, <8 x i64>) - -define <8 x i64>@test_int_x86_avx512_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_prolv_q_512( -; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) -; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP7]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1) - ret <8 x i64> %1 -} - -define <8 x i64>@test_int_x86_avx512_mask_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP16]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 - ret <8 x i64> %3 -} - -define <8 x i64>@test_int_x86_avx512_maskz_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; 
CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP15]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer - ret <8 x i64> %3 -} - -declare <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32>, <16 x i32>) - -define <16 x i32>@test_int_x86_avx512_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_prorv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP7]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1) - ret <16 x i32> %1 -} - -define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> 
[[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP16]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 - ret <16 x i32> %3 -} - -define <16 x i32>@test_int_x86_avx512_maskz_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP15]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer - ret <16 x i32> %3 -} - -declare <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64>, <8 x i64>) - -define <8 x i64>@test_int_x86_avx512_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_prorv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) -; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP7]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1) - ret <8 x i64> %1 -} - -define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP16]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 - ret <8 x i64> %3 -} - -define <8 x i64>@test_int_x86_avx512_maskz_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> 
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP15]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer - ret <8 x i64> %3 -} - -declare <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32>, i32) - -define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_prol_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer -; 
CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 3) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 - %4 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 4) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer - %7 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 5) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %6, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %7, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64>, i32) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_prol_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> 
[[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 3) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 - %4 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 4) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer - %7 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 5) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %3, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %6, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %7, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -declare <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32>, i32) - -define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_pror_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: 
@test_int_x86_avx512_pror_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 
x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %1 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 3) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 - %4 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 4) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer - %7 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 5) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %6, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %7, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64>, i32) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_pror_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_pror_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5)) -; 
CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %1 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 3) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 - %4 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 4) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer - %7 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 5) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %3, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %6, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %7, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i32, <8 x i64>, i8) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[X0:%.*]], i32 4) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 5) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer 
-; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[X0]], i32 5) -; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 6) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[X0]], i32 6) -; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP18]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP18]] -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP20]], <8 x i64> [[TMP25]], <8 x i64> [[TMP22]] -; CHECK-NEXT: [[TMP26:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP19]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[TMP15]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP16]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[_MSPROP_SELECT1]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 6, <8 x i64> zeroinitializer, i8 %x3) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -declare <16 x i32>@llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16) - -define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[X0:%.*]], i32 4) -; CHECK-NEXT: 
[[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 5) -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[X0]], i32 5) -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 6) -; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[X0]], i32 6) -; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> -; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP18]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> [[TMP23]], [[TMP18]] -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP20]], <16 x i32> [[TMP25]], <16 x i32> [[TMP22]] -; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[TMP15]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP16]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[_MSPROP_SELECT1]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 6, <16 x i32> zeroinitializer, i16 %x3) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i32, <16 x i32>, i16) - -define { 
<16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_psra_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 3) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[X0:%.*]], i32 3) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[X0]], i32 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 5) -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[X0]], i32 5) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x 
i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i32, <8 x i64>, i8) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_psra_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 3) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[X0:%.*]], i32 3) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[X0]], i32 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 5) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[X0]], i32 5) -; CHECK-NEXT: 
[[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i32, <16 x i32>, i16) - -define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_psll_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 3) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[X0:%.*]], i32 3) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[X0]], i32 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> -; CHECK-NEXT: 
[[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 5) -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[X0]], i32 5) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i32, <8 x i64>, i8) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_psll_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 3) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[X0:%.*]], i32 3) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> 
[[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[X0]], i32 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 5) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[X0]], i32 5) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psll_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP10]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psll_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[A2:%.*]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP11]] -; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[A2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP19]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psll_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) 
to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP10]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP18]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone - -define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psll_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP10]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psll_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), 
i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[A2:%.*]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[A2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP19]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psll_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP10]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP17]], <8 x i64> [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP11]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP18]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> 
zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone - -define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrl_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP10]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psrl_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[A2:%.*]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP11]] -; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[A2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP19]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> 
%a2, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psrl_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP10]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP18]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone - -define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrl_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP10]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 
%mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psrl_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[A2:%.*]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[A2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP19]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psrl_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], 
[[TMP10]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP17]], <8 x i64> [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP11]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP18]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone - -define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psra_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP10]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psra_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[A2:%.*]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> 
[[TMP16]], [[TMP11]] -; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[A2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP19]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psra_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP10]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP18]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone - -define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psra_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; 
CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
-; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP10]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psra_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]]
-; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[A2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP19]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psra_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP10]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP17]], <8 x i64> [[TMP14]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP11]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP18]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
-
-define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_psllv_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP7]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psllv_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[A2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psllv_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP15]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_psllv_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP7]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psllv_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[A2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP16]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psllv_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP15]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
-
-
-define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_psrav_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP7]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psrav_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[A2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psrav_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP15]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_psrav_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP7]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psrav_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[A2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP16]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psrav_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP15]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
-
-define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_psrlv_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP7]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psrlv_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[A2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP15]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_psrlv_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP7]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psrlv_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[A2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP16]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP15]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, ptr %ptr) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_psrlv_q_memop(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 64
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i64> [[_MSLD]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = sext <8 x i1> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP2]], <8 x i64> [[B]])
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[B]])
-; CHECK-NEXT: store <8 x i64> [[TMP11]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %b = load <8 x i64>, ptr %ptr
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_cvt_dq2pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[CVT:%.*]] = sitofp <8 x i32> [[X0:%.*]] to <8 x double>
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[CVT]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[CVT:%.*]] = sitofp <8 x i32> [[X0:%.*]] to <8 x double>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[CVT]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X1:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[CVT]], <8 x double> [[X1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP13]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
- ret <8 x double> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_cvt_udq2pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[CVT:%.*]] = uitofp <8 x i32> [[X0:%.*]] to <8 x double>
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[CVT]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[CVT:%.*]] = uitofp <8 x i32> [[X0:%.*]] to <8 x double>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[CVT]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X1:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[CVT]], <8 x double> [[X1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP13]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
- ret <8 x double> %res
-}
-
-define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) #0 {
-; CHECK-LABEL: @test_x86_vcvtph2ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) #0 {
-; CHECK-LABEL: @test_x86_vcvtph2ps_512_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_vcvtph2ps_512_rrk(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> [[A1:%.*]], i16 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_vcvtph2ps_512_sae_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP2]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_vcvtph2ps_512_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP2]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
- ret <16 x float> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
-
-define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_valign_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> 
-; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]], <8 x i32> 
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[PALIGNR]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_valign_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> 
-; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]], <8 x i32> 
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[PALIGNR]], [[SRC:%.*]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[PALIGNR]], <8 x i64> [[SRC]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> %src, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
-
-define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_maskz_valign_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> 
-; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> [[A:%.*]], <16 x i32> 
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[PALIGNR]], zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[PALIGNR]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP10]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i32 5, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
-
-declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP7]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP18]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermilvar_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
- ret <8 x double> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
- ret <16 x float> %res
-}
-
-define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
- ret <16 x float> %res
-}
-
-
-define <16 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermilvar_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
- ret <16 x float> %res
-}
-
-; Test case to make sure we can print shuffle decode comments for constant pool loads.
-define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> )
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[X2]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP17]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]]
-; CHECK: 18:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 19:
-; CHECK-NEXT: [[TMP20:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> )
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i16 [[X3]] to <16 x i1>
-; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP22]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x float> [[TMP20]] to <16 x i32>
-; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[TMP24]], zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = or <16 x i32> [[TMP26]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP27]], <16 x i32> [[TMP23]]
-; CHECK-NEXT: [[TMP28:%.*]] = select <16 x i1> [[TMP22]], <16 x float> [[TMP20]], <16 x float> zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP29]], 0
-; CHECK-NEXT: br i1 [[_MSCMP4]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
-; CHECK: 30:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 31:
-; CHECK-NEXT: [[TMP32:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> )
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], [[_MSPROP_SELECT1]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[TMP16]], [[TMP28]]
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> zeroinitializer, [[_MSPROP]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP32]], [[RES3]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES4]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> zeroinitializer, i16 %x3)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 -1)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res2, %res3
- ret <16 x float> %res4
-}
-
-define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_mask_mul_epi32_rr(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i64> [[TMP3]], splat (i64 32)
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32)
-; CHECK-NEXT: [[TMP10:%.*]] = ashr <8 x i64> [[TMP8]], splat (i64 32)
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32)
-; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32)
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], 
zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP19]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rrk( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) -; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = ashr <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[_MSPROP]] -; CHECK-NEXT: [[TMP27:%.*]] = or <8 x i64> [[TMP26]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP27]], <8 x i64> [[TMP24]] -; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP28]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 
8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rrkz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32) -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) -; CHECK-NEXT: [[TMP11:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32) -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]] -; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]] -; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP27]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 
[[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32) -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) -; CHECK-NEXT: [[TMP15:%.*]] = ashr <8 x i64> [[TMP13]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP21:%.*]] = ashr <8 x i64> [[TMP19]], splat (i64 32) -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP24]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat 
(i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) -; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] -; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP33]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) -; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; 
CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] -; CHECK-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP32]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmb( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] 
= or <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) -; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP25]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: 
[[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) -; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = shl <8 x i64> [[TMP14]], splat (i64 32) -; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = ashr <8 x i64> [[TMP23]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP30:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP32]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP33]], <8 x i64> [[TMP30]] -; CHECK-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP34]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmbk_buildvector(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmbk_buildvector( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1 -; CHECK-NEXT: [[VECINIT_I1:%.*]] = insertelement <8 x i64> [[VECINIT_I]], i64 [[Q]], i32 1 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i64> [[_MSPROP1]], i64 [[_MSLD]], i32 2 
-; CHECK-NEXT: [[VECINIT_I2:%.*]] = insertelement <8 x i64> [[VECINIT_I1]], i64 [[Q]], i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i64> [[_MSPROP2]], i64 [[_MSLD]], i32 3 -; CHECK-NEXT: [[VECINIT_I3:%.*]] = insertelement <8 x i64> [[VECINIT_I2]], i64 [[Q]], i32 3 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i64> [[_MSPROP3]], i64 [[_MSLD]], i32 4 -; CHECK-NEXT: [[VECINIT_I4:%.*]] = insertelement <8 x i64> [[VECINIT_I3]], i64 [[Q]], i32 4 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i64> [[_MSPROP4]], i64 [[_MSLD]], i32 5 -; CHECK-NEXT: [[VECINIT_I5:%.*]] = insertelement <8 x i64> [[VECINIT_I4]], i64 [[Q]], i32 5 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> [[_MSPROP5]], i64 [[_MSLD]], i32 6 -; CHECK-NEXT: [[VECINIT_I6:%.*]] = insertelement <8 x i64> [[VECINIT_I5]], i64 [[Q]], i32 6 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i64> [[_MSPROP6]], i64 [[_MSLD]], i32 7 -; CHECK-NEXT: [[B64:%.*]] = insertelement <8 x i64> [[VECINIT_I6]], i64 [[Q]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP7]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) -; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = shl <8 x i64> [[TMP14]], splat (i64 32) -; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = ashr <8 x i64> [[TMP23]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP8:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP30:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[_MSPROP8]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[_MSPROP8]] -; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP32]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP33]], <8 x i64> [[TMP30]] -; CHECK-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP34]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement < 8 x i64> undef, i64 %q, i32 0 - %vecinit.i1 = insertelement < 8 x i64> %vecinit.i, i64 %q, i32 1 - %vecinit.i2 = insertelement < 8 x i64> %vecinit.i1, i64 %q, i32 2 - %vecinit.i3 = insertelement < 8 x i64> %vecinit.i2, i64 %q, i32 3 - %vecinit.i4 = insertelement < 8 x i64> %vecinit.i3, 
i64 %q, i32 4 - %vecinit.i5 = insertelement < 8 x i64> %vecinit.i4, i64 %q, i32 5 - %vecinit.i6 = insertelement < 8 x i64> %vecinit.i5, i64 %q, i32 6 - %b64 = insertelement < 8 x i64> %vecinit.i6, i64 %q, i32 7 - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) -; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP2]], <8 x i64> 
zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] -; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP33]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmbkz_buildvector(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmbkz_buildvector( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1 -; CHECK-NEXT: [[VECINIT_I1:%.*]] = insertelement <8 x i64> [[VECINIT_I]], i64 [[Q]], i32 1 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i64> [[_MSPROP1]], i64 [[_MSLD]], i32 2 -; CHECK-NEXT: [[VECINIT_I2:%.*]] = insertelement <8 x i64> [[VECINIT_I1]], i64 [[Q]], i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i64> [[_MSPROP2]], i64 [[_MSLD]], i32 3 -; CHECK-NEXT: [[VECINIT_I3:%.*]] = insertelement <8 x i64> [[VECINIT_I2]], i64 [[Q]], i32 3 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i64> [[_MSPROP3]], i64 [[_MSLD]], i32 4 -; CHECK-NEXT: [[VECINIT_I4:%.*]] = insertelement <8 x i64> [[VECINIT_I3]], i64 [[Q]], i32 4 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i64> [[_MSPROP4]], i64 [[_MSLD]], i32 5 -; CHECK-NEXT: [[VECINIT_I5:%.*]] = insertelement <8 x i64> [[VECINIT_I4]], i64 [[Q]], i32 5 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> [[_MSPROP5]], i64 [[_MSLD]], i32 6 -; CHECK-NEXT: [[VECINIT_I6:%.*]] = insertelement <8 x i64> [[VECINIT_I5]], i64 [[Q]], i32 6 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i64> [[_MSPROP6]], i64 [[_MSLD]], i32 7 -; CHECK-NEXT: [[B64:%.*]] = insertelement <8 x i64> [[VECINIT_I6]], i64 [[Q]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = 
bitcast <8 x i64> [[_MSPROP7]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) -; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP8:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP8]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP8]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] -; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP33]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement < 8 x i64> undef, i64 %q, i32 0 - %vecinit.i1 = insertelement < 8 x i64> %vecinit.i, i64 %q, i32 1 - %vecinit.i2 = insertelement < 8 x i64> %vecinit.i1, i64 %q, i32 2 - %vecinit.i3 = insertelement < 8 x i64> %vecinit.i2, i64 %q, i32 3 - %vecinit.i4 = insertelement < 8 x i64> %vecinit.i3, i64 %q, i32 4 - %vecinit.i5 = insertelement < 8 x i64> %vecinit.i4, i64 %q, i32 5 - %vecinit.i6 = insertelement < 8 x i64> %vecinit.i5, i64 %q, i32 6 - %b64 = insertelement < 8 x i64> %vecinit.i6, i64 %q, i32 7 - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8) - -define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) #0 { -; CHECK-LABEL: @test_mask_mul_epu32_rr( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP3]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP19]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rrk( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> 
[[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[_MSPROP]] -; CHECK-NEXT: [[TMP27:%.*]] = or <8 x i64> [[TMP26]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP27]], <8 x i64> [[TMP24]] -; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP28]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rrkz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]] -; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]] -; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP27]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: 
@test_mask_mul_epu32_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP18]], [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP24]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rmk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 
[[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] -; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP33]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rmkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to 
<8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] -; CHECK-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP32]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rmb( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; 
CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP25]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> 
zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP21]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[TMP23]] -; CHECK-NEXT: [[TMP26:%.*]] = and <8 x i64> [[TMP14]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP30:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP32]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP33]], <8 x i64> [[TMP30]] -; CHECK-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP34]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; 
CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] -; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP33]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8) - -define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_vextractf32x4( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[A:%.*]], <16 x float> [[A]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i32> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP12]], <4 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[B]] -; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[TMP13]] -; - %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask) - ret <4 x float> %res -} - -declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8) - -define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_vextracti64x4( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[A:%.*]], <8 x i64> [[A]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP]], <4 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i64> [[TMP4]], [[B:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i64> [[TMP10]], <4 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP4]], <4 x i64> [[B]] -; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x i64> [[TMP11]] -; - %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 1, <4 x i64> %b, i8 %mask) - ret <4 x i64> %res -} - -declare <4 x i64> 
@llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8) - -define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_vextracti32x4( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[A]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer -; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x i32> [[TMP10]] -; - %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask) - ret <4 x i32> %res -} - -declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8) - -define <4 x double> @test_vextractf64x4(<8 x double> %a) #0 { -; CHECK-LABEL: @test_vextractf64x4( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> [[A]], <4 x i32> -; CHECK-NEXT: store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x double> [[TMP2]] -; - %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 1, <4 x double> zeroinitializer, i8 -1) - ret <4 x double> %res -} - -declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8) - -declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_insertf32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> 
[[X0:%.*]], <16 x float> [[TMP3]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP4]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_insertf32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[X3:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP14]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> [[X3]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP15]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_maskz_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_insertf32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[TMP4]], <16 x i32> -; CHECK-NEXT: 
[[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP12]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP13]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) - ret <16 x float> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_inserti32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP4]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_inserti32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x 
i32> [[_MSPROP1]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X3:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X3]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP13]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_maskz_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_inserti32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) - ret <16 x i32> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_insertf64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> 
[[X0:%.*]], <8 x double> [[TMP3]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP4]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_insertf64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[X3:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP14]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> [[X3]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP15]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_maskz_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_insertf64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[TMP4]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 
x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP12]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) - ret <8 x double> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_inserti64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[TMP3]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP4]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_inserti64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X3:%.*]] -; 
CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X3]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP13]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_inserti64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[TMP4]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_movntdqa(ptr %a0) #0 { -; -; CHECK-LABEL: @test_x86_avx512_movntdqa( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 3: -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[A0:%.*]], align 64, !nontemporal [[META2]] -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A0]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64 -; CHECK-NEXT: store <8 x i64> [[_MSLD]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP4]] -; - %res = call <8 x i64> @llvm.x86.avx512.movntdqa(ptr %a0) - ret <8 x i64> %res -} - -declare <8 x i64> 
@llvm.x86.avx512.movntdqa(ptr) nounwind readonly - -define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { -; CHECK-LABEL: @test_cmp_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[TMP4]], splat (i32 -1) -; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP10]], i32 0 -; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP12]], [[TMP1]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) -; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i32> [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] -; CHECK-NEXT: [[TMP20:%.*]] = icmp ult <16 x i32> [[TMP14]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <16 x i32> [[TMP15]], [[TMP18]] -; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i1> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp slt <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP22]] to i16 -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP24]], i32 1 -; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP25]], i32 1 -; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP27:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) -; CHECK-NEXT: [[TMP28:%.*]] = and <16 x i32> [[TMP26]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i32> [[TMP26]], [[TMP1]] -; CHECK-NEXT: [[TMP30:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP31:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) -; CHECK-NEXT: [[TMP32:%.*]] = and <16 x i32> [[TMP30]], [[TMP31]] -; CHECK-NEXT: [[TMP33:%.*]] = or <16 x i32> [[TMP30]], [[TMP2]] -; CHECK-NEXT: [[TMP34:%.*]] = icmp ule <16 x i32> [[TMP28]], [[TMP33]] -; CHECK-NEXT: [[TMP35:%.*]] = icmp ule <16 x i32> [[TMP29]], [[TMP32]] -; CHECK-NEXT: [[TMP36:%.*]] = xor <16 x i1> [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP37:%.*]] = icmp sle <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i1> [[TMP36]] to i16 -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i1> [[TMP37]] to i16 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP38]], i32 2 -; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], 
i16 [[TMP39]], i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 0, i32 3 -; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 0, i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = xor <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i32> [[TMP41]], zeroinitializer -; CHECK-NEXT: [[TMP43:%.*]] = xor <16 x i32> [[TMP41]], splat (i32 -1) -; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i32> [[TMP43]], [[TMP40]] -; CHECK-NEXT: [[TMP45:%.*]] = icmp eq <16 x i32> [[TMP44]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP42]], [[TMP45]] -; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP47:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP4]] to i16 -; CHECK-NEXT: [[TMP48:%.*]] = bitcast <16 x i1> [[TMP46]] to i16 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP47]], i32 4 -; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP48]], i32 4 -; CHECK-NEXT: [[TMP49:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP50:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) -; CHECK-NEXT: [[TMP51:%.*]] = and <16 x i32> [[TMP49]], [[TMP50]] -; CHECK-NEXT: [[TMP52:%.*]] = or <16 x i32> [[TMP49]], [[TMP1]] -; CHECK-NEXT: [[TMP53:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP54:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) -; CHECK-NEXT: [[TMP55:%.*]] = and <16 x i32> [[TMP53]], [[TMP54]] -; CHECK-NEXT: [[TMP56:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP57:%.*]] = icmp uge <16 x i32> [[TMP51]], [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = icmp uge <16 x i32> [[TMP52]], [[TMP55]] -; CHECK-NEXT: [[TMP59:%.*]] = xor <16 x i1> [[TMP57]], [[TMP58]] -; CHECK-NEXT: [[TMP60:%.*]] = icmp sge <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP61:%.*]] = bitcast <16 x i1> [[TMP59]] to i16 -; CHECK-NEXT: [[TMP62:%.*]] = bitcast <16 x i1> [[TMP60]] to i16 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP61]], i32 5 -; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP62]], i32 5 -; CHECK-NEXT: [[TMP63:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP64:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) -; CHECK-NEXT: [[TMP65:%.*]] = and <16 x i32> [[TMP63]], [[TMP64]] -; CHECK-NEXT: [[TMP66:%.*]] = or <16 x i32> [[TMP63]], [[TMP1]] -; CHECK-NEXT: [[TMP67:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP68:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) -; CHECK-NEXT: [[TMP69:%.*]] = and <16 x i32> [[TMP67]], [[TMP68]] -; CHECK-NEXT: [[TMP70:%.*]] = or <16 x i32> [[TMP67]], [[TMP2]] -; CHECK-NEXT: [[TMP71:%.*]] = icmp ugt <16 x i32> [[TMP65]], [[TMP70]] -; CHECK-NEXT: [[TMP72:%.*]] = icmp ugt <16 x i32> [[TMP66]], [[TMP69]] -; CHECK-NEXT: [[TMP73:%.*]] = xor <16 x i1> [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[TMP74:%.*]] = icmp sgt <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP75:%.*]] = bitcast <16 x i1> [[TMP73]] to i16 -; CHECK-NEXT: [[TMP76:%.*]] = bitcast <16 x i1> [[TMP74]] to i16 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP75]], i32 6 -; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP76]], i32 6 -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 0, i32 7 -; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 -1, i32 7 -; CHECK-NEXT: store <8 x i16> 
[[_MSPROP8]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i16> [[VEC7]] -; - %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) - %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 - %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) - %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 - %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1) - %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 - %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1) - %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 - %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1) - %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 - %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1) - %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 - %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1) - %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 - %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1) - %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 - ret <8 x i16> %vec7 -} - -define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_cmp_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]] -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP5]], splat (i32 -1) -; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP6]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]] -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i1> [[TMP17]] to i16 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP19]], i32 0 -; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP20]], i32 0 -; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) -; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i32> [[TMP21]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> 
[[TMP21]], [[TMP1]]
-; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP27:%.*]] = and <16 x i32> [[TMP25]], [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = or <16 x i32> [[TMP25]], [[TMP2]]
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ult <16 x i32> [[TMP23]], [[TMP28]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <16 x i32> [[TMP24]], [[TMP27]]
-; CHECK-NEXT: [[TMP31:%.*]] = xor <16 x i1> [[TMP29]], [[TMP30]]
-; CHECK-NEXT: [[TMP32:%.*]] = icmp slt <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i1> [[TMP31]], [[TMP33]]
-; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[TMP32]], [[TMP33]]
-; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i1> [[TMP31]], [[TMP34]]
-; CHECK-NEXT: [[TMP38:%.*]] = or <16 x i1> [[TMP35]], [[TMP36]]
-; CHECK-NEXT: [[TMP39:%.*]] = or <16 x i1> [[TMP38]], [[TMP37]]
-; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i1> [[TMP32]], [[TMP34]]
-; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i1> [[TMP39]] to i16
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i1> [[TMP40]] to i16
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP41]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP42]], i32 1
-; CHECK-NEXT: [[TMP43:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP44:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP45:%.*]] = and <16 x i32> [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = or <16 x i32> [[TMP43]], [[TMP1]]
-; CHECK-NEXT: [[TMP47:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP48:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i32> [[TMP47]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = or <16 x i32> [[TMP47]], [[TMP2]]
-; CHECK-NEXT: [[TMP51:%.*]] = icmp ule <16 x i32> [[TMP45]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = icmp ule <16 x i32> [[TMP46]], [[TMP49]]
-; CHECK-NEXT: [[TMP53:%.*]] = xor <16 x i1> [[TMP51]], [[TMP52]]
-; CHECK-NEXT: [[TMP54:%.*]] = icmp sle <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP55:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP56:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP57:%.*]] = and <16 x i1> [[TMP53]], [[TMP55]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP54]], [[TMP55]]
-; CHECK-NEXT: [[TMP59:%.*]] = and <16 x i1> [[TMP53]], [[TMP56]]
-; CHECK-NEXT: [[TMP60:%.*]] = or <16 x i1> [[TMP57]], [[TMP58]]
-; CHECK-NEXT: [[TMP61:%.*]] = or <16 x i1> [[TMP60]], [[TMP59]]
-; CHECK-NEXT: [[TMP62:%.*]] = and <16 x i1> [[TMP54]], [[TMP56]]
-; CHECK-NEXT: [[TMP63:%.*]] = bitcast <16 x i1> [[TMP61]] to i16
-; CHECK-NEXT: [[TMP64:%.*]] = bitcast <16 x i1> [[TMP62]] to i16
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP63]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[TMP64]], i32 2
-; CHECK-NEXT: [[TMP65:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP66:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP67:%.*]] = and <16 x i1> zeroinitializer, [[TMP65]]
-; CHECK-NEXT: [[TMP68:%.*]] = and <16 x i1> zeroinitializer, [[TMP65]]
-; CHECK-NEXT: [[TMP69:%.*]] = and <16 x i1> zeroinitializer, [[TMP66]]
-; CHECK-NEXT: [[TMP70:%.*]] = or <16 x i1> [[TMP67]], [[TMP68]]
-; CHECK-NEXT: [[TMP71:%.*]] = or <16 x i1> [[TMP70]], [[TMP69]]
-; CHECK-NEXT: [[TMP72:%.*]] = and <16 x i1> zeroinitializer, [[TMP66]]
-; CHECK-NEXT: [[TMP73:%.*]] = bitcast <16 x i1> [[TMP71]] to i16
-; CHECK-NEXT: [[TMP74:%.*]] = bitcast <16 x i1> [[TMP72]] to i16
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP73]], i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[TMP74]], i32 3
-; CHECK-NEXT: [[TMP75:%.*]] = xor <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP76:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <16 x i32> [[TMP76]], zeroinitializer
-; CHECK-NEXT: [[TMP78:%.*]] = xor <16 x i32> [[TMP76]], splat (i32 -1)
-; CHECK-NEXT: [[TMP79:%.*]] = and <16 x i32> [[TMP78]], [[TMP75]]
-; CHECK-NEXT: [[TMP80:%.*]] = icmp eq <16 x i32> [[TMP79]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP77]], [[TMP80]]
-; CHECK-NEXT: [[TMP81:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP82:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP83:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP84:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP82]]
-; CHECK-NEXT: [[TMP85:%.*]] = and <16 x i1> [[TMP81]], [[TMP82]]
-; CHECK-NEXT: [[TMP86:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP83]]
-; CHECK-NEXT: [[TMP87:%.*]] = or <16 x i1> [[TMP84]], [[TMP85]]
-; CHECK-NEXT: [[TMP88:%.*]] = or <16 x i1> [[TMP87]], [[TMP86]]
-; CHECK-NEXT: [[TMP89:%.*]] = and <16 x i1> [[TMP81]], [[TMP83]]
-; CHECK-NEXT: [[TMP90:%.*]] = bitcast <16 x i1> [[TMP88]] to i16
-; CHECK-NEXT: [[TMP91:%.*]] = bitcast <16 x i1> [[TMP89]] to i16
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP90]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP91]], i32 4
-; CHECK-NEXT: [[TMP92:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP93:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP94:%.*]] = and <16 x i32> [[TMP92]], [[TMP93]]
-; CHECK-NEXT: [[TMP95:%.*]] = or <16 x i32> [[TMP92]], [[TMP1]]
-; CHECK-NEXT: [[TMP96:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP97:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP98:%.*]] = and <16 x i32> [[TMP96]], [[TMP97]]
-; CHECK-NEXT: [[TMP99:%.*]] = or <16 x i32> [[TMP96]], [[TMP2]]
-; CHECK-NEXT: [[TMP100:%.*]] = icmp uge <16 x i32> [[TMP94]], [[TMP99]]
-; CHECK-NEXT: [[TMP101:%.*]] = icmp uge <16 x i32> [[TMP95]], [[TMP98]]
-; CHECK-NEXT: [[TMP102:%.*]] = xor <16 x i1> [[TMP100]], [[TMP101]]
-; CHECK-NEXT: [[TMP103:%.*]] = icmp sge <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP104:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP105:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP106:%.*]] = and <16 x i1> [[TMP102]], [[TMP104]]
-; CHECK-NEXT: [[TMP107:%.*]] = and <16 x i1> [[TMP103]], [[TMP104]]
-; CHECK-NEXT: [[TMP108:%.*]] = and <16 x i1> [[TMP102]], [[TMP105]]
-; CHECK-NEXT: [[TMP109:%.*]] = or <16 x i1> [[TMP106]], [[TMP107]]
-; CHECK-NEXT: [[TMP110:%.*]] = or <16 x i1> [[TMP109]], [[TMP108]]
-; CHECK-NEXT: [[TMP111:%.*]] = and <16 x i1> [[TMP103]], [[TMP105]]
-; CHECK-NEXT: [[TMP112:%.*]] = bitcast <16 x i1> [[TMP110]] to i16
-; CHECK-NEXT: [[TMP113:%.*]] = bitcast <16 x i1> [[TMP111]] to i16
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP112]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP113]], i32 5
-; CHECK-NEXT: [[TMP114:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP115:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP116:%.*]] = and <16 x i32> [[TMP114]], [[TMP115]]
-; CHECK-NEXT: [[TMP117:%.*]] = or <16 x i32> [[TMP114]], [[TMP1]]
-; CHECK-NEXT: [[TMP118:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP119:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP120:%.*]] = and <16 x i32> [[TMP118]], [[TMP119]]
-; CHECK-NEXT: [[TMP121:%.*]] = or <16 x i32> [[TMP118]], [[TMP2]]
-; CHECK-NEXT: [[TMP122:%.*]] = icmp ugt <16 x i32> [[TMP116]], [[TMP121]]
-; CHECK-NEXT: [[TMP123:%.*]] = icmp ugt <16 x i32> [[TMP117]], [[TMP120]]
-; CHECK-NEXT: [[TMP124:%.*]] = xor <16 x i1> [[TMP122]], [[TMP123]]
-; CHECK-NEXT: [[TMP125:%.*]] = icmp sgt <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP126:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP127:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP128:%.*]] = and <16 x i1> [[TMP124]], [[TMP126]]
-; CHECK-NEXT: [[TMP129:%.*]] = and <16 x i1> [[TMP125]], [[TMP126]]
-; CHECK-NEXT: [[TMP130:%.*]] = and <16 x i1> [[TMP124]], [[TMP127]]
-; CHECK-NEXT: [[TMP131:%.*]] = or <16 x i1> [[TMP128]], [[TMP129]]
-; CHECK-NEXT: [[TMP132:%.*]] = or <16 x i1> [[TMP131]], [[TMP130]]
-; CHECK-NEXT: [[TMP133:%.*]] = and <16 x i1> [[TMP125]], [[TMP127]]
-; CHECK-NEXT: [[TMP134:%.*]] = bitcast <16 x i1> [[TMP132]] to i16
-; CHECK-NEXT: [[TMP135:%.*]] = bitcast <16 x i1> [[TMP133]] to i16
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP134]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP135]], i32 6
-; CHECK-NEXT: [[TMP136:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP137:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP138:%.*]] = and <16 x i1> zeroinitializer, [[TMP136]]
-; CHECK-NEXT: [[TMP139:%.*]] = and <16 x i1> splat (i1 true), [[TMP136]]
-; CHECK-NEXT: [[TMP140:%.*]] = and <16 x i1> zeroinitializer, [[TMP137]]
-; CHECK-NEXT: [[TMP141:%.*]] = or <16 x i1> [[TMP138]], [[TMP139]]
-; CHECK-NEXT: [[TMP142:%.*]] = or <16 x i1> [[TMP141]], [[TMP140]]
-; CHECK-NEXT: [[TMP143:%.*]] = and <16 x i1> splat (i1 true), [[TMP137]]
-; CHECK-NEXT: [[TMP144:%.*]] = bitcast <16 x i1> [[TMP142]] to i16
-; CHECK-NEXT: [[TMP145:%.*]] = bitcast <16 x i1> [[TMP143]] to i16
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 [[TMP144]], i32 7
-; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 [[TMP145]], i32 7
-; CHECK-NEXT: store <8 x i16> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i16> [[VEC7]]
-;
- %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
- %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
- %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
- %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
- %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
- %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
- %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
- %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
- %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
- %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
- %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
- %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
- %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
- %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
- %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
- %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
- ret <8 x i16> %vec7
-}
-
-declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
-
-define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 {
-; CHECK-LABEL: @test_ucmp_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[TMP4]], splat (i32 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP10]], i32 0
-; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[A0]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i32> [[A1]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp ult <16 x i32> [[TMP13]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ult <16 x i32> [[TMP14]], [[TMP16]]
-; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i1> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i1> [[TMP20]] to i16
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP21]] to i16
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP22]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP23]], i32 1
-; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP25:%.*]] = and <16 x i32> [[A0]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP27:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP28:%.*]] = and <16 x i32> [[A1]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp ule <16 x i32> [[TMP25]], [[TMP29]]
-; CHECK-NEXT: [[TMP31:%.*]] = icmp ule <16 x i32> [[TMP26]], [[TMP28]]
-; CHECK-NEXT: [[TMP32:%.*]] = xor <16 x i1> [[TMP30]], [[TMP31]]
-; CHECK-NEXT: [[TMP33:%.*]] = icmp ule <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast <16 x i1> [[TMP32]] to i16
-; CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i1> [[TMP33]] to i16
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP34]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[TMP35]], i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 0, i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 0, i32 3
-; CHECK-NEXT: [[TMP36:%.*]] = xor <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP37:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <16 x i32> [[TMP37]], zeroinitializer
-; CHECK-NEXT: [[TMP39:%.*]] = xor <16 x i32> [[TMP37]], splat (i32 -1)
-; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i32> [[TMP39]], [[TMP36]]
-; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <16 x i32> [[TMP40]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP38]], [[TMP41]]
-; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP4]] to i16
-; CHECK-NEXT: [[TMP44:%.*]] = bitcast <16 x i1> [[TMP42]] to i16
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP43]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP44]], i32 4
-; CHECK-NEXT: [[TMP45:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP46:%.*]] = and <16 x i32> [[A0]], [[TMP45]]
-; CHECK-NEXT: [[TMP47:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP48:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i32> [[A1]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP51:%.*]] = icmp uge <16 x i32> [[TMP46]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = icmp uge <16 x i32> [[TMP47]], [[TMP49]]
-; CHECK-NEXT: [[TMP53:%.*]] = xor <16 x i1> [[TMP51]], [[TMP52]]
-; CHECK-NEXT: [[TMP54:%.*]] = icmp uge <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP55:%.*]] = bitcast <16 x i1> [[TMP53]] to i16
-; CHECK-NEXT: [[TMP56:%.*]] = bitcast <16 x i1> [[TMP54]] to i16
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP55]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP56]], i32 5
-; CHECK-NEXT: [[TMP57:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i32> [[A0]], [[TMP57]]
-; CHECK-NEXT: [[TMP59:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP60:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP61:%.*]] = and <16 x i32> [[A1]], [[TMP60]]
-; CHECK-NEXT: [[TMP62:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP63:%.*]] = icmp ugt <16 x i32> [[TMP58]], [[TMP62]]
-; CHECK-NEXT: [[TMP64:%.*]] = icmp ugt <16 x i32> [[TMP59]], [[TMP61]]
-; CHECK-NEXT: [[TMP65:%.*]] = xor <16 x i1> [[TMP63]], [[TMP64]]
-; CHECK-NEXT: [[TMP66:%.*]] = icmp ugt <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP67:%.*]] = bitcast <16 x i1> [[TMP65]] to i16
-; CHECK-NEXT: [[TMP68:%.*]] = bitcast <16 x i1> [[TMP66]] to i16
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP67]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP68]], i32 6
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 0, i32 7
-; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 -1, i32 7
-; CHECK-NEXT: store <8 x i16> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i16> [[VEC7]]
-;
- %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
- %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
- %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
- %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
- %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
- %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
- %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
- %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
- %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
- %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
- %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
- %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
- %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
- %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
- %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
- %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
- ret <8 x i16> %vec7
-}
-
-define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_ucmp_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP5]], splat (i32 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP7]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i1> [[TMP17]] to i16
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP19]], i32 0
-; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i32> [[A0]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP25:%.*]] = and <16 x i32> [[A1]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP27:%.*]] = icmp ult <16 x i32> [[TMP22]], [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = icmp ult <16 x i32> [[TMP23]], [[TMP25]]
-; CHECK-NEXT: [[TMP29:%.*]] = xor <16 x i1> [[TMP27]], [[TMP28]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP33:%.*]] = and <16 x i1> [[TMP29]], [[TMP31]]
-; CHECK-NEXT: [[TMP34:%.*]] = and <16 x i1> [[TMP30]], [[TMP31]]
-; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i1> [[TMP29]], [[TMP32]]
-; CHECK-NEXT: [[TMP36:%.*]] = or <16 x i1> [[TMP33]], [[TMP34]]
-; CHECK-NEXT: [[TMP37:%.*]] = or <16 x i1> [[TMP36]], [[TMP35]]
-; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP30]], [[TMP32]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i1> [[TMP37]] to i16
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i1> [[TMP38]] to i16
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP39]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP40]], i32 1
-; CHECK-NEXT: [[TMP41:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP42:%.*]] = and <16 x i32> [[A0]], [[TMP41]]
-; CHECK-NEXT: [[TMP43:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP44:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP45:%.*]] = and <16 x i32> [[A1]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP47:%.*]] = icmp ule <16 x i32> [[TMP42]], [[TMP46]]
-; CHECK-NEXT: [[TMP48:%.*]] = icmp ule <16 x i32> [[TMP43]], [[TMP45]]
-; CHECK-NEXT: [[TMP49:%.*]] = xor <16 x i1> [[TMP47]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = icmp ule <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP51:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP52:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP53:%.*]] = and <16 x i1> [[TMP49]], [[TMP51]]
-; CHECK-NEXT: [[TMP54:%.*]] = and <16 x i1> [[TMP50]], [[TMP51]]
-; CHECK-NEXT: [[TMP55:%.*]] = and <16 x i1> [[TMP49]], [[TMP52]]
-; CHECK-NEXT: [[TMP56:%.*]] = or <16 x i1> [[TMP53]], [[TMP54]]
-; CHECK-NEXT: [[TMP57:%.*]] = or <16 x i1> [[TMP56]], [[TMP55]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP50]], [[TMP52]]
-; CHECK-NEXT: [[TMP59:%.*]] = bitcast <16 x i1> [[TMP57]] to i16
-; CHECK-NEXT: [[TMP60:%.*]] = bitcast <16 x i1> [[TMP58]] to i16
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP59]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[TMP60]], i32 2
-; CHECK-NEXT: [[TMP61:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP62:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP63:%.*]] = and <16 x i1> zeroinitializer, [[TMP61]]
-; CHECK-NEXT: [[TMP64:%.*]] = and <16 x i1> zeroinitializer, [[TMP61]]
-; CHECK-NEXT: [[TMP65:%.*]] = and <16 x i1> zeroinitializer, [[TMP62]]
-; CHECK-NEXT: [[TMP66:%.*]] = or <16 x i1> [[TMP63]], [[TMP64]]
-; CHECK-NEXT: [[TMP67:%.*]] = or <16 x i1> [[TMP66]], [[TMP65]]
-; CHECK-NEXT: [[TMP68:%.*]] = and <16 x i1> zeroinitializer, [[TMP62]]
-; CHECK-NEXT: [[TMP69:%.*]] = bitcast <16 x i1> [[TMP67]] to i16
-; CHECK-NEXT: [[TMP70:%.*]] = bitcast <16 x i1> [[TMP68]] to i16
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP69]], i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[TMP70]], i32 3
-; CHECK-NEXT: [[TMP71:%.*]] = xor <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP72:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP73:%.*]] = icmp ne <16 x i32> [[TMP72]], zeroinitializer
-; CHECK-NEXT: [[TMP74:%.*]] = xor <16 x i32> [[TMP72]], splat (i32 -1)
-; CHECK-NEXT: [[TMP75:%.*]] = and <16 x i32> [[TMP74]], [[TMP71]]
-; CHECK-NEXT: [[TMP76:%.*]] = icmp eq <16 x i32> [[TMP75]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP73]], [[TMP76]]
-; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP78:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP79:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP80:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP78]]
-; CHECK-NEXT: [[TMP81:%.*]] = and <16 x i1> [[TMP77]], [[TMP78]]
-; CHECK-NEXT: [[TMP82:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP79]]
-; CHECK-NEXT: [[TMP83:%.*]] = or <16 x i1> [[TMP80]], [[TMP81]]
-; CHECK-NEXT: [[TMP84:%.*]] = or <16 x i1> [[TMP83]], [[TMP82]]
-; CHECK-NEXT: [[TMP85:%.*]] = and <16 x i1> [[TMP77]], [[TMP79]]
-; CHECK-NEXT: [[TMP86:%.*]] = bitcast <16 x i1> [[TMP84]] to i16
-; CHECK-NEXT: [[TMP87:%.*]] = bitcast <16 x i1> [[TMP85]] to i16
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP86]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP87]], i32 4
-; CHECK-NEXT: [[TMP88:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP89:%.*]] = and <16 x i32> [[A0]], [[TMP88]]
-; CHECK-NEXT: [[TMP90:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP91:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP92:%.*]] = and <16 x i32> [[A1]], [[TMP91]]
-; CHECK-NEXT: [[TMP93:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP94:%.*]] = icmp uge <16 x i32> [[TMP89]], [[TMP93]]
-; CHECK-NEXT: [[TMP95:%.*]] = icmp uge <16 x i32> [[TMP90]], [[TMP92]]
-; CHECK-NEXT: [[TMP96:%.*]] = xor <16 x i1> [[TMP94]], [[TMP95]]
-; CHECK-NEXT: [[TMP97:%.*]] = icmp uge <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP98:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP99:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP100:%.*]] = and <16 x i1> [[TMP96]], [[TMP98]]
-; CHECK-NEXT: [[TMP101:%.*]] = and <16 x i1> [[TMP97]], [[TMP98]]
-; CHECK-NEXT: [[TMP102:%.*]] = and <16 x i1> [[TMP96]], [[TMP99]]
-; CHECK-NEXT: [[TMP103:%.*]] = or <16 x i1> [[TMP100]], [[TMP101]]
-; CHECK-NEXT: [[TMP104:%.*]] = or <16 x i1> [[TMP103]], [[TMP102]]
-; CHECK-NEXT: [[TMP105:%.*]] = and <16 x i1> [[TMP97]], [[TMP99]]
-; CHECK-NEXT: [[TMP106:%.*]] = bitcast <16 x i1> [[TMP104]] to i16
-; CHECK-NEXT: [[TMP107:%.*]] = bitcast <16 x i1> [[TMP105]] to i16
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP106]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP107]], i32 5
-; CHECK-NEXT: [[TMP108:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP109:%.*]] = and <16 x i32> [[A0]], [[TMP108]]
-; CHECK-NEXT: [[TMP110:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP111:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP112:%.*]] = and <16 x i32> [[A1]], [[TMP111]]
-; CHECK-NEXT: [[TMP113:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP114:%.*]] = icmp ugt <16 x i32> [[TMP109]], [[TMP113]]
-; CHECK-NEXT: [[TMP115:%.*]] = icmp ugt <16 x i32> [[TMP110]], [[TMP112]]
-; CHECK-NEXT: [[TMP116:%.*]] = xor <16 x i1> [[TMP114]], [[TMP115]]
-; CHECK-NEXT: [[TMP117:%.*]] = icmp ugt <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP118:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP119:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP120:%.*]] = and <16 x i1> [[TMP116]], [[TMP118]]
-; CHECK-NEXT: [[TMP121:%.*]] = and <16 x i1> [[TMP117]], [[TMP118]]
-; CHECK-NEXT: [[TMP122:%.*]] = and <16 x i1> [[TMP116]], [[TMP119]]
-; CHECK-NEXT: [[TMP123:%.*]] = or <16 x i1> [[TMP120]], [[TMP121]]
-; CHECK-NEXT: [[TMP124:%.*]] = or <16 x i1> [[TMP123]], [[TMP122]]
-; CHECK-NEXT: [[TMP125:%.*]] = and <16 x i1> [[TMP117]], [[TMP119]]
-; CHECK-NEXT: [[TMP126:%.*]] = bitcast <16 x i1> [[TMP124]] to i16
-; CHECK-NEXT: [[TMP127:%.*]] = bitcast <16 x i1> [[TMP125]] to i16
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP126]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP127]], i32 6
-; CHECK-NEXT: [[TMP128:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP129:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP130:%.*]] = and <16 x i1> zeroinitializer, [[TMP128]]
-; CHECK-NEXT: [[TMP131:%.*]] = and <16 x i1> splat (i1 true), [[TMP128]]
-; CHECK-NEXT: [[TMP132:%.*]] = and <16 x i1> zeroinitializer, [[TMP129]]
-; CHECK-NEXT: [[TMP133:%.*]] = or <16 x i1> [[TMP130]], [[TMP131]]
-; CHECK-NEXT: [[TMP134:%.*]] = or <16 x i1> [[TMP133]], [[TMP132]]
-; CHECK-NEXT: [[TMP135:%.*]] = and <16 x i1> splat (i1 true), [[TMP129]]
-; CHECK-NEXT: [[TMP136:%.*]] = bitcast <16 x i1> [[TMP134]] to i16
-; CHECK-NEXT: [[TMP137:%.*]] = bitcast <16 x i1> [[TMP135]] to i16
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 [[TMP136]], i32 7
-; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 [[TMP137]], i32 7
-; CHECK-NEXT: store <8 x i16> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i16> [[VEC7]]
-;
- %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
- %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
- %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
- %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
- %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
- %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
- %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
- %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
- %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
- %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
- %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
- %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
- %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
- %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
- %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
- %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
- ret <8 x i16> %vec7
-}
-
-declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
-
-define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 {
-; CHECK-LABEL: @test_cmp_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[TMP4]], splat (i64 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP10]], i32 0
-; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP12]], [[TMP1]]
-; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP16]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP16]], [[TMP2]]
-; CHECK-NEXT: [[TMP20:%.*]] = icmp ult <8 x i64> [[TMP14]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <8 x i64> [[TMP15]], [[TMP18]]
-; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i1> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = icmp slt <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP22]] to i8
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i1> [[TMP23]] to i8
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP24]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP25]], i32 1
-; CHECK-NEXT: [[TMP26:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP27:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP28:%.*]] = and <8 x i64> [[TMP26]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[TMP26]], [[TMP1]]
-; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP32:%.*]] = and <8 x i64> [[TMP30]], [[TMP31]]
-; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP30]], [[TMP2]]
-; CHECK-NEXT: [[TMP34:%.*]] = icmp ule <8 x i64> [[TMP28]], [[TMP33]]
-; CHECK-NEXT: [[TMP35:%.*]] = icmp ule <8 x i64> [[TMP29]], [[TMP32]]
-; CHECK-NEXT: [[TMP36:%.*]] = xor <8 x i1> [[TMP34]], [[TMP35]]
-; CHECK-NEXT: [[TMP37:%.*]] = icmp sle <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i1> [[TMP36]] to i8
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i1> [[TMP37]] to i8
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP38]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP39]], i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 0, i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 0, i32 3
-; CHECK-NEXT: [[TMP40:%.*]] = xor <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP41:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i64> [[TMP41]], zeroinitializer
-; CHECK-NEXT: [[TMP43:%.*]] = xor <8 x i64> [[TMP41]], splat (i64 -1)
-; CHECK-NEXT: [[TMP44:%.*]] = and <8 x i64> [[TMP43]], [[TMP40]]
-; CHECK-NEXT: [[TMP45:%.*]] = icmp eq <8 x i64> [[TMP44]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP42]], [[TMP45]]
-; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP47:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP4]] to i8
-; CHECK-NEXT: [[TMP48:%.*]] = bitcast <8 x i1> [[TMP46]] to i8
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP47]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP48]], i32 4
-; CHECK-NEXT: [[TMP49:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP50:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP51:%.*]] = and <8 x i64> [[TMP49]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = or <8 x i64> [[TMP49]], [[TMP1]]
-; CHECK-NEXT: [[TMP53:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP54:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP55:%.*]] = and <8 x i64> [[TMP53]], [[TMP54]]
-; CHECK-NEXT: [[TMP56:%.*]] = or <8 x i64> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP57:%.*]] = icmp uge <8 x i64> [[TMP51]], [[TMP56]]
-; CHECK-NEXT: [[TMP58:%.*]] = icmp uge <8 x i64> [[TMP52]], [[TMP55]]
-; CHECK-NEXT: [[TMP59:%.*]] = xor <8 x i1> [[TMP57]], [[TMP58]]
-; CHECK-NEXT: [[TMP60:%.*]] = icmp sge <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP61:%.*]] = bitcast <8 x i1> [[TMP59]] to i8
-; CHECK-NEXT: [[TMP62:%.*]] = bitcast <8 x i1> [[TMP60]] to i8
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP61]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP62]], i32 5
-; CHECK-NEXT: [[TMP63:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP64:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP65:%.*]] = and <8 x i64> [[TMP63]], [[TMP64]]
-; CHECK-NEXT: [[TMP66:%.*]] = or <8 x i64> [[TMP63]], [[TMP1]]
-; CHECK-NEXT: [[TMP67:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP68:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP69:%.*]] = and <8 x i64> [[TMP67]], [[TMP68]]
-; CHECK-NEXT: [[TMP70:%.*]] = or <8 x i64> [[TMP67]], [[TMP2]]
-; CHECK-NEXT: [[TMP71:%.*]] = icmp ugt <8 x i64> [[TMP65]], [[TMP70]]
-; CHECK-NEXT: [[TMP72:%.*]] = icmp ugt <8 x i64> [[TMP66]], [[TMP69]]
-; CHECK-NEXT: [[TMP73:%.*]] = xor <8 x i1> [[TMP71]], [[TMP72]]
-; CHECK-NEXT: [[TMP74:%.*]] = icmp sgt <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP75:%.*]] = bitcast <8 x i1> [[TMP73]] to i8
-; CHECK-NEXT: [[TMP76:%.*]] = bitcast <8 x i1> [[TMP74]] to i8
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP75]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP76]], i32 6
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 0, i32 7
-; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 -1, i32 7
-; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i8> [[VEC7]]
-;
- %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
- %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
- %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
- %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
- %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
- %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
- %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
- %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
- %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
- %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
- %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
- %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
- %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
- %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
- %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
- %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
- ret <8 x i8> %vec7
-}
-
-define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_cmp_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP5]], splat (i64 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP7]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i1> [[TMP17]] to i8
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP19]], i32 0
-; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP21]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP21]], [[TMP1]]
-; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP26:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i64> [[TMP25]], [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = or <8 x i64> [[TMP25]], [[TMP2]]
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ult <8 x i64> [[TMP23]], [[TMP28]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <8 x i64> [[TMP24]], [[TMP27]]
-; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i1> [[TMP29]], [[TMP30]]
-; CHECK-NEXT: [[TMP32:%.*]] = icmp slt <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP31]], [[TMP33]]
-; CHECK-NEXT: [[TMP36:%.*]] = and <8 x i1> [[TMP32]], [[TMP33]]
-; CHECK-NEXT: [[TMP37:%.*]] = and <8 x i1> [[TMP31]], [[TMP34]]
-; CHECK-NEXT: [[TMP38:%.*]] = or <8 x i1> [[TMP35]], [[TMP36]]
-; CHECK-NEXT: [[TMP39:%.*]] = or <8 x i1> [[TMP38]], [[TMP37]]
-; CHECK-NEXT: [[TMP40:%.*]] = and <8 x i1> [[TMP32]], [[TMP34]]
-; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i1> [[TMP39]] to i8
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i1> [[TMP40]] to i8
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP41]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP42]], i32 1
-; CHECK-NEXT: [[TMP43:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP44:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP45:%.*]] = and <8 x i64> [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = or <8 x i64> [[TMP43]], [[TMP1]]
-; CHECK-NEXT: [[TMP47:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP48:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP49:%.*]] = and <8 x i64> [[TMP47]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = or <8 x i64> [[TMP47]], [[TMP2]]
-; CHECK-NEXT: [[TMP51:%.*]] = icmp ule <8 x i64> [[TMP45]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = icmp ule <8 x i64> [[TMP46]], [[TMP49]]
-; CHECK-NEXT: [[TMP53:%.*]] = xor <8 x i1> [[TMP51]], [[TMP52]]
-; CHECK-NEXT: [[TMP54:%.*]] = icmp sle <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP55:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP56:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP57:%.*]] = and <8 x i1> [[TMP53]], [[TMP55]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i1> [[TMP54]], [[TMP55]]
-; CHECK-NEXT: [[TMP59:%.*]] = and <8 x i1> [[TMP53]], [[TMP56]]
-; CHECK-NEXT: [[TMP60:%.*]] = or <8 x i1> [[TMP57]], [[TMP58]]
-; CHECK-NEXT: [[TMP61:%.*]] = or <8 x i1> [[TMP60]], [[TMP59]]
-; CHECK-NEXT: [[TMP62:%.*]] = and <8 x i1> [[TMP54]], [[TMP56]]
-; CHECK-NEXT: [[TMP63:%.*]] = bitcast <8 x i1> [[TMP61]] to i8
-; CHECK-NEXT: [[TMP64:%.*]] = bitcast <8 x i1> [[TMP62]] to i8
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP63]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP64]], i32 2
-; CHECK-NEXT: [[TMP65:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP66:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP67:%.*]] = and <8 x i1> zeroinitializer, [[TMP65]]
-; CHECK-NEXT: [[TMP68:%.*]] = and <8 x i1> zeroinitializer, [[TMP65]]
-; CHECK-NEXT: [[TMP69:%.*]] = and <8 x i1> zeroinitializer, [[TMP66]]
-; CHECK-NEXT: [[TMP70:%.*]] = or <8 x i1> [[TMP67]], [[TMP68]]
-; CHECK-NEXT: [[TMP71:%.*]] = or <8 x i1> [[TMP70]], [[TMP69]]
-; CHECK-NEXT: [[TMP72:%.*]] = and <8 x i1> zeroinitializer, [[TMP66]]
-; CHECK-NEXT: [[TMP73:%.*]] = bitcast <8 x i1> [[TMP71]] to i8
-; CHECK-NEXT: [[TMP74:%.*]] = bitcast <8 x i1> [[TMP72]] to i8
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 [[TMP73]], i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 [[TMP74]], i32 3
-; CHECK-NEXT: [[TMP75:%.*]] = xor <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP76:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <8 x i64> [[TMP76]], zeroinitializer
-; CHECK-NEXT: [[TMP78:%.*]] = xor <8 x i64> [[TMP76]], splat (i64 -1)
-; CHECK-NEXT: [[TMP79:%.*]] = and <8 x i64> [[TMP78]], [[TMP75]]
-; CHECK-NEXT: [[TMP80:%.*]] = icmp eq <8 x i64> [[TMP79]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP77]], [[TMP80]]
-; CHECK-NEXT: [[TMP81:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP82:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP83:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP84:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP82]]
-; CHECK-NEXT: [[TMP85:%.*]] = and <8 x i1> [[TMP81]], [[TMP82]]
-; CHECK-NEXT: [[TMP86:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP83]]
-; CHECK-NEXT: [[TMP87:%.*]] = or <8 x i1> [[TMP84]], [[TMP85]]
-; CHECK-NEXT: [[TMP88:%.*]] = or <8 x i1> [[TMP87]], [[TMP86]]
-; CHECK-NEXT: [[TMP89:%.*]] = and <8 x i1> [[TMP81]], [[TMP83]]
-; CHECK-NEXT: [[TMP90:%.*]] = bitcast <8 x i1> [[TMP88]] to i8
-; CHECK-NEXT: [[TMP91:%.*]] = bitcast <8 x i1> [[TMP89]] to i8
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP90]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP91]], i32 4
-; CHECK-NEXT: [[TMP92:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP93:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP94:%.*]] = and <8 x i64> [[TMP92]], [[TMP93]]
-; CHECK-NEXT: [[TMP95:%.*]] = or <8 x i64> [[TMP92]], [[TMP1]]
-; CHECK-NEXT: [[TMP96:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP97:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP98:%.*]] = and <8 x i64> [[TMP96]], [[TMP97]]
-; CHECK-NEXT: [[TMP99:%.*]] = or <8 x i64> [[TMP96]], [[TMP2]]
-; CHECK-NEXT: [[TMP100:%.*]] = icmp uge <8 x i64> [[TMP94]], [[TMP99]]
-; CHECK-NEXT: [[TMP101:%.*]] = icmp uge <8 x i64> [[TMP95]], [[TMP98]]
-; CHECK-NEXT: [[TMP102:%.*]] = xor <8 x i1> [[TMP100]], [[TMP101]]
-; CHECK-NEXT: [[TMP103:%.*]] = icmp sge <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP104:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP105:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP106:%.*]] = and <8 x i1> [[TMP102]], [[TMP104]]
-; CHECK-NEXT: [[TMP107:%.*]] = and <8 x i1> [[TMP103]], [[TMP104]]
-; CHECK-NEXT: [[TMP108:%.*]] = and <8 x i1> [[TMP102]], [[TMP105]]
-; CHECK-NEXT: [[TMP109:%.*]] = or <8 x i1> [[TMP106]], [[TMP107]]
-; CHECK-NEXT: [[TMP110:%.*]] = or <8 x i1> [[TMP109]], [[TMP108]]
-; CHECK-NEXT: [[TMP111:%.*]] = and <8 x i1> [[TMP103]], [[TMP105]]
-; CHECK-NEXT: [[TMP112:%.*]] = bitcast <8 x i1> [[TMP110]] to i8
-; CHECK-NEXT: [[TMP113:%.*]] = bitcast <8 x i1> [[TMP111]] to i8
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP112]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP113]], i32 5
-; CHECK-NEXT: [[TMP114:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP115:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP116:%.*]] = and <8 x i64> [[TMP114]], [[TMP115]]
-; CHECK-NEXT: [[TMP117:%.*]] = or <8 x i64> [[TMP114]], [[TMP1]]
-; CHECK-NEXT: [[TMP118:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP119:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP120:%.*]] = and <8 x i64> [[TMP118]], [[TMP119]]
-; CHECK-NEXT: [[TMP121:%.*]] = or <8 x i64> [[TMP118]], [[TMP2]]
-; CHECK-NEXT: [[TMP122:%.*]] = icmp ugt <8 x i64> [[TMP116]], [[TMP121]]
-; CHECK-NEXT: [[TMP123:%.*]] = icmp ugt <8 x i64> [[TMP117]], [[TMP120]]
-; CHECK-NEXT: [[TMP124:%.*]] = xor <8 x i1> [[TMP122]], [[TMP123]]
-; CHECK-NEXT: [[TMP125:%.*]] = icmp sgt <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP126:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP127:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP128:%.*]] = and <8 x i1> [[TMP124]], [[TMP126]]
-; CHECK-NEXT: [[TMP129:%.*]] = and <8 x i1> [[TMP125]], [[TMP126]]
-; CHECK-NEXT: [[TMP130:%.*]] = and <8 x i1> [[TMP124]], [[TMP127]]
-; CHECK-NEXT: [[TMP131:%.*]] = or <8 x i1> [[TMP128]], [[TMP129]]
-; CHECK-NEXT: [[TMP132:%.*]] = or <8 x i1> [[TMP131]], [[TMP130]]
-; CHECK-NEXT: [[TMP133:%.*]] = and <8 x i1> [[TMP125]], [[TMP127]]
-; CHECK-NEXT: [[TMP134:%.*]] = bitcast <8 x i1> [[TMP132]] to i8
-; CHECK-NEXT: [[TMP135:%.*]] = bitcast <8 x i1> [[TMP133]] to i8
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP134]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP135]], i32 6
-; CHECK-NEXT: [[TMP136:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP137:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP138:%.*]] = and <8 x i1> zeroinitializer, [[TMP136]]
-; CHECK-NEXT: [[TMP139:%.*]] = and <8 x i1> splat (i1 true), [[TMP136]]
-; CHECK-NEXT: [[TMP140:%.*]] = and <8 x i1> zeroinitializer, [[TMP137]]
-; CHECK-NEXT: [[TMP141:%.*]] = or <8 x i1> [[TMP138]], [[TMP139]]
-; CHECK-NEXT: [[TMP142:%.*]] = or <8 x i1> [[TMP141]], [[TMP140]]
-; CHECK-NEXT: [[TMP143:%.*]] = and <8 x i1> splat (i1 true), [[TMP137]]
-; CHECK-NEXT: [[TMP144:%.*]] = bitcast <8 x i1> [[TMP142]] to i8
-; CHECK-NEXT: [[TMP145:%.*]] = bitcast <8 x i1> [[TMP143]] to i8
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 [[TMP144]], i32 7
-; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 [[TMP145]], i32 7
-; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i8> [[VEC7]]
-;
- %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
- %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
- %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
- %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
- %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
- %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
- %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
- %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
- %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
- %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
- %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
- %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
- %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
- %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
- %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
- %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
- ret <8 x i8> %vec7
-}
-
-declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
-
-define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 {
-; CHECK-LABEL: @test_ucmp_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[TMP4]], splat (i64 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP10]], i32 0
-; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[A0]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[A1]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp ult <8 x i64> [[TMP13]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ult <8 x i64> [[TMP14]], [[TMP16]]
-; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i1> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i1> [[TMP20]] to i8
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP21]] to i8
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP22]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP23]], i32 1
-; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[A0]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP27:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP28:%.*]] = and <8 x i64> [[A1]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp ule <8 x i64> [[TMP25]], [[TMP29]]
-; CHECK-NEXT: [[TMP31:%.*]] = icmp ule <8 x i64> [[TMP26]], [[TMP28]]
-; CHECK-NEXT: [[TMP32:%.*]] = xor <8 x i1> [[TMP30]], [[TMP31]]
-; CHECK-NEXT: [[TMP33:%.*]] = icmp ule <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x i1> [[TMP32]] to i8
-; CHECK-NEXT: [[TMP35:%.*]] = bitcast <8 x i1> [[TMP33]] to i8
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP34]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP35]], i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 0, i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 0, i32 3
-; CHECK-NEXT: [[TMP36:%.*]] = xor <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP37:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <8 x i64> [[TMP37]], zeroinitializer
-; CHECK-NEXT: [[TMP39:%.*]] = xor <8 x i64> [[TMP37]], splat (i64 -1)
-; CHECK-NEXT: [[TMP40:%.*]] = and <8 x i64> [[TMP39]], [[TMP36]]
-; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <8 x i64> [[TMP40]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP38]], [[TMP41]]
-; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP4]] to i8
-; CHECK-NEXT: [[TMP44:%.*]] = bitcast <8 x i1> [[TMP42]] to i8
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP43]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP44]], i32 4
-; CHECK-NEXT: [[TMP45:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP46:%.*]] = and <8 x i64> [[A0]], [[TMP45]]
-; CHECK-NEXT: [[TMP47:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP48:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP49:%.*]] = and <8 x i64> [[A1]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP51:%.*]] = icmp uge <8 x i64> [[TMP46]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = icmp uge <8 x i64> [[TMP47]], [[TMP49]]
-; CHECK-NEXT: [[TMP53:%.*]] = xor <8 x i1> [[TMP51]], [[TMP52]]
-; CHECK-NEXT: [[TMP54:%.*]] = icmp uge <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP55:%.*]] = bitcast <8 x i1> [[TMP53]] to i8
-; CHECK-NEXT: [[TMP56:%.*]] = bitcast <8 x i1> [[TMP54]] to i8
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP55]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP56]], i32 5
-; CHECK-NEXT: [[TMP57:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i64> [[A0]], [[TMP57]]
-; CHECK-NEXT: [[TMP59:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP60:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP61:%.*]] = and <8 x i64> [[A1]], [[TMP60]]
-; CHECK-NEXT: [[TMP62:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP63:%.*]] = icmp ugt <8 x i64> [[TMP58]], [[TMP62]]
-; CHECK-NEXT: [[TMP64:%.*]] = icmp ugt <8 x i64> [[TMP59]], [[TMP61]]
-; CHECK-NEXT: [[TMP65:%.*]] = xor <8 x i1> [[TMP63]], [[TMP64]]
-; CHECK-NEXT: [[TMP66:%.*]] = icmp ugt <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP67:%.*]] = bitcast <8 x i1> [[TMP65]] to i8
-; CHECK-NEXT: [[TMP68:%.*]] = bitcast <8 x i1> [[TMP66]] to i8
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP67]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP68]], i32 6
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 0, i32 7
-; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 -1, i32 7
-; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i8> [[VEC7]]
-;
- %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
- %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
- %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
- %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
- %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
- %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
- %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
- %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
- %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
- %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
- %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
- %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
- %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
- %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
- %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
- %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
- ret <8 x i8> %vec7
-}
-
-define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_ucmp_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP5]], splat (i64 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP7]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i1> [[TMP17]] to i8
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP19]], i32 0
-; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[A0]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[A1]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP27:%.*]] = icmp ult <8 x i64> [[TMP22]], [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = icmp ult <8 x i64> [[TMP23]], [[TMP25]]
-; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP27]], [[TMP28]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP33:%.*]] = and <8 x i1> [[TMP29]], [[TMP31]]
-; CHECK-NEXT: [[TMP34:%.*]] = and <8 x i1> [[TMP30]], [[TMP31]]
-; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP29]], [[TMP32]]
-; CHECK-NEXT: [[TMP36:%.*]] = or <8 x i1> [[TMP33]], [[TMP34]]
-; CHECK-NEXT: [[TMP37:%.*]] = or <8 x i1> [[TMP36]], [[TMP35]]
-; CHECK-NEXT: [[TMP38:%.*]] = and <8 x i1> [[TMP30]], [[TMP32]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i1> [[TMP37]] to i8
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i1> [[TMP38]] to i8
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP39]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP40]], i32 1
-; CHECK-NEXT: [[TMP41:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP42:%.*]] = and <8 x i64> [[A0]], [[TMP41]]
-; CHECK-NEXT: [[TMP43:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP44:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP45:%.*]] = and <8 x i64> [[A1]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP47:%.*]] = icmp ule <8 x i64> [[TMP42]], [[TMP46]]
-; CHECK-NEXT: [[TMP48:%.*]] = icmp ule <8 x i64> [[TMP43]], [[TMP45]]
-; CHECK-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP47]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = icmp ule <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP51:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP52:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP53:%.*]] = and <8 x i1> [[TMP49]], [[TMP51]]
-; CHECK-NEXT: [[TMP54:%.*]] = and <8 x i1> [[TMP50]], [[TMP51]]
-; CHECK-NEXT: [[TMP55:%.*]] = and <8 x i1> [[TMP49]], [[TMP52]]
-; CHECK-NEXT: [[TMP56:%.*]] = or <8 x i1> [[TMP53]], [[TMP54]]
-; CHECK-NEXT: [[TMP57:%.*]] = or <8 x i1> [[TMP56]], [[TMP55]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i1> [[TMP50]], [[TMP52]]
-; CHECK-NEXT: [[TMP59:%.*]] = bitcast <8 x i1> [[TMP57]] to i8
-; CHECK-NEXT: [[TMP60:%.*]] = bitcast <8 x i1> [[TMP58]] to i8
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP59]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP60]], i32 2
-; CHECK-NEXT: [[TMP61:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP62:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP63:%.*]] = and <8 x i1> zeroinitializer, [[TMP61]]
-; CHECK-NEXT: [[TMP64:%.*]] = and <8 x i1> zeroinitializer, [[TMP61]]
-; CHECK-NEXT: [[TMP65:%.*]] = and <8 x i1> zeroinitializer, [[TMP62]]
-; CHECK-NEXT: [[TMP66:%.*]] = or <8 x i1> [[TMP63]], [[TMP64]]
-; CHECK-NEXT: [[TMP67:%.*]] = or <8 x i1> [[TMP66]], [[TMP65]]
-; CHECK-NEXT: [[TMP68:%.*]] = and <8 x i1> zeroinitializer, [[TMP62]]
-; CHECK-NEXT: [[TMP69:%.*]] = bitcast <8 x i1> [[TMP67]] to i8
-; CHECK-NEXT: [[TMP70:%.*]] = bitcast <8 x i1> [[TMP68]] to i8
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 [[TMP69]], i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 [[TMP70]], i32 3
-; CHECK-NEXT: [[TMP71:%.*]] = xor <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP72:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP73:%.*]] = icmp ne <8 x i64> [[TMP72]], zeroinitializer
-; CHECK-NEXT: [[TMP74:%.*]] = xor <8 x i64> [[TMP72]], splat (i64 -1)
-; CHECK-NEXT: [[TMP75:%.*]] = and <8 x i64> [[TMP74]], [[TMP71]]
-; CHECK-NEXT: [[TMP76:%.*]] = icmp eq <8 x i64> [[TMP75]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP73]], [[TMP76]]
-; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP78:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP79:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP80:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP78]]
-; CHECK-NEXT: [[TMP81:%.*]] = and <8 x i1> [[TMP77]], [[TMP78]]
-; CHECK-NEXT: [[TMP82:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP79]]
-; CHECK-NEXT: [[TMP83:%.*]] = or <8 x i1> [[TMP80]], [[TMP81]]
-; CHECK-NEXT: [[TMP84:%.*]] = or <8 x i1> [[TMP83]], [[TMP82]]
-; CHECK-NEXT: [[TMP85:%.*]] = and <8 x i1> [[TMP77]], [[TMP79]]
-; CHECK-NEXT: [[TMP86:%.*]] = bitcast <8 x i1> [[TMP84]] to i8
-; CHECK-NEXT: [[TMP87:%.*]] = bitcast <8 x i1> [[TMP85]] to i8
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP86]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP87]], i32 4
-; CHECK-NEXT: [[TMP88:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP89:%.*]] = and <8 x i64> [[A0]], [[TMP88]]
-; CHECK-NEXT: [[TMP90:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP91:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP92:%.*]] = and <8 x i64> [[A1]], [[TMP91]]
-; CHECK-NEXT: [[TMP93:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP94:%.*]] = icmp uge <8 x i64> [[TMP89]], [[TMP93]]
-; CHECK-NEXT: [[TMP95:%.*]] = icmp uge <8 x i64> [[TMP90]], [[TMP92]]
-; CHECK-NEXT: [[TMP96:%.*]] = xor <8 x i1> [[TMP94]], [[TMP95]]
-; CHECK-NEXT: [[TMP97:%.*]] = icmp uge <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP98:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP99:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP100:%.*]] = and <8 x i1> [[TMP96]], [[TMP98]]
-; CHECK-NEXT: [[TMP101:%.*]] = and <8 x i1> [[TMP97]], [[TMP98]]
-; CHECK-NEXT: [[TMP102:%.*]] = and <8 x i1> [[TMP96]], [[TMP99]]
-; CHECK-NEXT: [[TMP103:%.*]] = or <8 x i1> [[TMP100]], [[TMP101]]
-; CHECK-NEXT: [[TMP104:%.*]] = or <8 x i1> [[TMP103]], [[TMP102]]
-; CHECK-NEXT: [[TMP105:%.*]] = and <8 x i1> [[TMP97]], [[TMP99]]
-; CHECK-NEXT: [[TMP106:%.*]] = bitcast <8 x i1> [[TMP104]] to i8
-; CHECK-NEXT: [[TMP107:%.*]] = bitcast <8 x i1> [[TMP105]] to i8
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP106]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP107]], i32 5
-; CHECK-NEXT: [[TMP108:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP109:%.*]] = and <8 x i64> [[A0]], [[TMP108]]
-; CHECK-NEXT: [[TMP110:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP111:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP112:%.*]] = and <8 x i64> [[A1]], [[TMP111]]
-; CHECK-NEXT: [[TMP113:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP114:%.*]] = icmp ugt <8 x i64> [[TMP109]], [[TMP113]]
-; CHECK-NEXT: [[TMP115:%.*]] = icmp ugt <8 x i64> [[TMP110]], [[TMP112]]
-; CHECK-NEXT: [[TMP116:%.*]] = xor <8 x i1> [[TMP114]], [[TMP115]]
-; CHECK-NEXT: [[TMP117:%.*]] = icmp ugt <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP118:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP119:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP120:%.*]] = and <8 x i1> [[TMP116]], [[TMP118]]
-; CHECK-NEXT: [[TMP121:%.*]] = and <8 x i1> [[TMP117]], [[TMP118]]
-; CHECK-NEXT: [[TMP122:%.*]] = and <8 x i1> [[TMP116]], [[TMP119]]
-; CHECK-NEXT: [[TMP123:%.*]] = or <8 x i1> [[TMP120]], [[TMP121]]
-; CHECK-NEXT: [[TMP124:%.*]] = or <8 x i1> [[TMP123]], [[TMP122]]
-; CHECK-NEXT: [[TMP125:%.*]] = and <8 x i1> [[TMP117]], [[TMP119]]
-; CHECK-NEXT: [[TMP126:%.*]] = bitcast <8 x i1> [[TMP124]] to i8
-; CHECK-NEXT: [[TMP127:%.*]] = bitcast <8 x i1> [[TMP125]] to i8
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP126]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP127]], i32 6
-; CHECK-NEXT: [[TMP128:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP129:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP130:%.*]] = and <8 x i1> zeroinitializer, [[TMP128]]
-; CHECK-NEXT: [[TMP131:%.*]] = and <8 x i1> splat (i1 true), [[TMP128]]
[[TMP128]] -; CHECK-NEXT: [[TMP132:%.*]] = and <8 x i1> zeroinitializer, [[TMP129]] -; CHECK-NEXT: [[TMP133:%.*]] = or <8 x i1> [[TMP130]], [[TMP131]] -; CHECK-NEXT: [[TMP134:%.*]] = or <8 x i1> [[TMP133]], [[TMP132]] -; CHECK-NEXT: [[TMP135:%.*]] = and <8 x i1> splat (i1 true), [[TMP129]] -; CHECK-NEXT: [[TMP136:%.*]] = bitcast <8 x i1> [[TMP134]] to i8 -; CHECK-NEXT: [[TMP137:%.*]] = bitcast <8 x i1> [[TMP135]] to i8 -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 [[TMP136]], i32 7 -; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 [[TMP137]], i32 7 -; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i8> [[VEC7]] -; - %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask) - %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 - %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask) - %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 - %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask) - %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 - %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask) - %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 - %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask) - %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 - %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask) - %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 - %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask) - %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 - %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask) - %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 - ret <8 x i8> %vec7 -} - -declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone - -declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X0:%.*]], <4 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 
x i32> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X2]] -; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[MASK]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x float> [[TMP15]] to <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT3:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP22]], <16 x i32> [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP17]], <16 x float> [[TMP15]], <16 x float> zeroinitializer -; CHECK-NEXT: [[_MSPROP4:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSPROP_SELECT]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP4]], [[TMP14]] -; CHECK-NEXT: [[_MSPROP5:%.*]] = or <16 x i32> [[_MSPROP_SELECT3]], [[_MSPROP4]] -; CHECK-NEXT: [[RES5:%.*]] = fadd <16 x float> [[TMP23]], [[RES4]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES5]] -; - %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1) - %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) - %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %mask) - %res4 = fadd <16 x float> %res1, %res2 - %res5 = fadd <16 x float> %res3, %res4 - ret <16 x float> %res5 -} - -define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(ptr %x0ptr, <16 x float> %x2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf32x4_512_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[X0:%.*]] = load <4 x float>, ptr [[X0PTR:%.*]], align 16 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> [[_MSLD]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X0]], <16 
x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %x0 = load <4 x float>, ptr %x0ptr - %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) - ret <16 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_broadcastf64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr 
@__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_maskz_broadcastf64x4_512(<4 x double> %x0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_broadcastf64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(ptr %x0ptr, <8 x double> %x2, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf64x4_512_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[X0:%.*]] = load <4 x double>, ptr [[X0PTR:%.*]], align 32 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 32 -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[_MSLD]], <4 x i64> [[_MSLD]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> 
[[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[_MSPROP]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP18]] -; - %x0 = load <4 x double>, ptr %x0ptr - %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) - ret <8 x double> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32>, i16) - -define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> [[X0]], <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] -; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[MASK]] to <16 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP15]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT3:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP19]], <16 x i32> [[TMP16]] -; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP15]], <16 x i32> [[TMP13]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { 
<16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP4]], 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP21]], <16 x i32> [[_MSPROP_SELECT]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP12]], 1 -; CHECK-NEXT: [[TMP23:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP22]], <16 x i32> [[_MSPROP_SELECT3]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP20]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP23]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(ptr %x0ptr, <16 x i32> %x2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti32x4_512_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[X0:%.*]] = load <4 x i32>, ptr [[X0PTR:%.*]], align 16 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> [[_MSLD]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP16]] -; - %x0 = load <4 x i32>, ptr %x0ptr - %res = call <16 x 
i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_broadcasti64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X2:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_broadcasti64x4_512(<4 x i64> %x0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_broadcasti64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> 
[[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP3]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP10]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(ptr %x0ptr, <8 x i64> %x2, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti64x4_512_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[X0:%.*]] = load <4 x i64>, ptr [[X0PTR:%.*]], align 32 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 32 -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[_MSLD]], <4 x i64> [[_MSLD]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[X0]], <4 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP16]] -; - %x0 = load <4 x i64>, ptr %x0ptr - %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) - ret <8 x i64> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pabs_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[X0:%.*]], i1 false) -; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) #0 { -; -; 
CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[X0:%.*]], i1 false) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pabs_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> [[X0:%.*]], i1 false) -; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> [[X0:%.*]], i1 false) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 
%x2) - ret <8 x i64> %res -} - -define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) #0 { -; -; CHECK-LABEL: @test_vptestmq( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[A0:%.*]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP1]], [[A1:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], splat (i64 -1) -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP12]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i1> [[TMP16]] to i8 -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[A0]], [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP1]], [[A1]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <8 x i64> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i64> [[TMP26]], splat (i64 -1) -; CHECK-NEXT: [[TMP29:%.*]] = and <8 x i64> [[TMP28]], [[TMP25]] -; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <8 x i64> [[TMP29]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <8 x i1> [[TMP27]], [[TMP30]] -; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[M:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = and <8 x i1> [[_MSPROP_ICMP1]], [[TMP32]] -; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP31]], [[TMP32]] -; CHECK-NEXT: [[TMP36:%.*]] = and <8 x i1> [[_MSPROP_ICMP1]], [[TMP33]] -; CHECK-NEXT: [[TMP37:%.*]] = or <8 x i1> [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP38:%.*]] = or <8 x i1> [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[TMP39:%.*]] = and <8 x i1> [[TMP31]], [[TMP33]] -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i1> [[TMP38]] to i8 -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i1> [[TMP39]] to i8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i8 [[TMP40]], [[TMP17]] -; CHECK-NEXT: [[RES2:%.*]] = add i8 [[TMP41]], [[TMP18]] -; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[RES2]] -; - %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1) - %res1 = call i8 
@llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m) - %res2 = add i8 %res1, %res - ret i8 %res2 -} -declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8) - -define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) #0 { -; -; CHECK-LABEL: @test_vptestmd( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[A0:%.*]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP1]], [[A1:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], splat (i32 -1) -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP12]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i1> [[TMP16]] to i16 -; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i32> [[A0]], [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i32> [[TMP1]], [[A1]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i32> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i32> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP28:%.*]] = xor <16 x i32> [[TMP26]], splat (i32 -1) -; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i32> [[TMP28]], [[TMP25]] -; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <16 x i32> [[TMP29]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> [[TMP27]], [[TMP30]] -; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[M:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = and <16 x i1> [[_MSPROP_ICMP1]], [[TMP32]] -; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i1> [[TMP31]], [[TMP32]] -; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[_MSPROP_ICMP1]], [[TMP33]] -; CHECK-NEXT: [[TMP37:%.*]] = or <16 x i1> [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP38:%.*]] = or <16 x i1> [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[TMP39:%.*]] = and <16 x i1> [[TMP31]], [[TMP33]] -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i1> [[TMP38]] to i16 -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i1> [[TMP39]] to i16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP40]], [[TMP17]] -; CHECK-NEXT: [[RES2:%.*]] = add i16 [[TMP41]], 
[[TMP18]] -; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[RES2]] -; - %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1) - %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m) - %res2 = add i16 %res1, %res - ret i16 %res2 -} -declare i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32>, <16 x i32>, i16) - -declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2) - -define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_ptestnm_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[X0:%.*]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP1]], [[X1:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[X0]], [[X1]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], splat (i32 -1) -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP12]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP18]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i1> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i1> [[TMP16]], [[TMP18]] -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[TMP24]] to i16 -; CHECK-NEXT: [[TMP27:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP28:%.*]] = and <16 x i32> [[X0]], [[TMP2]] -; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i32> [[TMP1]], [[X1]] -; CHECK-NEXT: [[TMP30:%.*]] = or <16 x i32> [[TMP27]], [[TMP28]] -; CHECK-NEXT: [[TMP31:%.*]] = or <16 x i32> [[TMP30]], [[TMP29]] -; CHECK-NEXT: [[TMP32:%.*]] = and <16 x i32> [[X0]], [[X1]] -; CHECK-NEXT: [[TMP33:%.*]] = xor <16 x i32> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP34:%.*]] = or <16 x i32> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i32> [[TMP34]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = xor <16 x i32> [[TMP34]], splat (i32 -1) -; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i32> [[TMP36]], [[TMP33]] -; CHECK-NEXT: [[TMP38:%.*]] = icmp eq <16 x i32> [[TMP37]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> 
[[TMP35]], [[TMP38]] -; CHECK-NEXT: [[TMP39:%.*]] = icmp eq <16 x i32> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP1]] to i16 -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i1> [[TMP39]] to i16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP25]], [[TMP40]] -; CHECK-NEXT: [[RES2:%.*]] = add i16 [[TMP26]], [[TMP41]] -; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[RES2]] -; - %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) - %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1) - %res2 = add i16 %res, %res1 - ret i16 %res2 -} - -declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2) - -define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_ptestnm_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[X0:%.*]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP1]], [[X1:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[X0]], [[X1]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], splat (i64 -1) -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP12]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i1> [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP18]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i1> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i1> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i1> [[TMP16]], [[TMP18]] -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i1> [[TMP23]] to i8 -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i1> [[TMP24]] to i8 -; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP28:%.*]] = and <8 x i64> [[X0]], [[TMP2]] -; CHECK-NEXT: [[TMP29:%.*]] = and <8 x i64> [[TMP1]], [[X1]] -; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP27]], [[TMP28]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[TMP29]] -; CHECK-NEXT: [[TMP32:%.*]] = and <8 x i64> [[X0]], [[X1]] -; CHECK-NEXT: [[TMP33:%.*]] = xor <8 x i64> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP34:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i64> [[TMP34]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = xor <8 x 
i64> [[TMP34]], splat (i64 -1) -; CHECK-NEXT: [[TMP37:%.*]] = and <8 x i64> [[TMP36]], [[TMP33]] -; CHECK-NEXT: [[TMP38:%.*]] = icmp eq <8 x i64> [[TMP37]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <8 x i1> [[TMP35]], [[TMP38]] -; CHECK-NEXT: [[TMP39:%.*]] = icmp eq <8 x i64> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP1]] to i8 -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i1> [[TMP39]] to i8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i8 [[TMP25]], [[TMP40]] -; CHECK-NEXT: [[RES2:%.*]] = add i8 [[TMP26]], [[TMP41]] -; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[RES2]] -; - %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - -declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone -define i16 @test_kand(i16 %a0, i16 %a1) #0 { -; -; CHECK-LABEL: @test_kand( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i1> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i1> [[TMP3]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i1> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP11]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP12]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i1> [[TMP13]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP13]], [[TMP16]] -; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i1> [[TMP17]], [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i1> [[TMP20]], [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP14]], [[TMP16]] -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP21]] to i16 -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP22]] to i16 -; CHECK-NEXT: store i16 [[TMP23]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[TMP24]] -; - %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8) - %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1) - ret i16 %t2 -} - -declare i16 @llvm.x86.avx512.kandn.w(i16, i16) nounwind readnone -define i16 @test_kandn(i16 %a0, i16 %a1) #0 { -; -; CHECK-LABEL: @test_kandn( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to 
<16 x i1> -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], splat (i1 true) -; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[_MSPROP]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i1> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[_MSPROP]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i1> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP5]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i1> [[TMP11]] to i16 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP12]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[TMP13]] to <16 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i1> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i1> [[TMP15]], splat (i1 true) -; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[_MSPROP1]], [[TMP16]] -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP18]], [[TMP16]] -; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i1> [[_MSPROP1]], [[TMP17]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i1> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i1> [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[TMP24]] to i16 -; CHECK-NEXT: store i16 [[TMP25]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[TMP26]] -; - %t1 = call i16 @llvm.x86.avx512.kandn.w(i16 %a0, i16 8) - %t2 = call i16 @llvm.x86.avx512.kandn.w(i16 %t1, i16 %a1) - ret i16 %t2 -} - -declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone -define i16 @test_knot(i16 %a0) #0 { -; -; CHECK-LABEL: @test_knot( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i1> [[TMP3]], splat (i1 true) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[_MSPROP]] to i16 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: store i16 [[TMP5]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[TMP6]] -; - %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0) - ret i16 %res -} - -declare i16 @llvm.x86.avx512.kor.w(i16, i16) nounwind readnone -define i16 @test_kor(i16 %a0, i16 %a1) #0 { -; -; CHECK-LABEL: @test_kor( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], splat (i1 true) -; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i1> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP3]], <i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x
i16> splat (i16 8) to <16 x i1>), i32 0), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 1), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 2), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 3), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 4), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 5), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 6), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 7), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 8), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 9), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 10), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 11), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 12), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 13), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 14), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 15), i1 true)> -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i1> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP4]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i1> [[TMP11]] to i16 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP12]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[TMP13]] to <16 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i1> [[TMP15]], splat (i1 true) -; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i1> [[TMP17]], splat (i1 true) -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP16]] -; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i1> [[TMP18]], [[TMP16]] -; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP14]], [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i1> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i1> [[TMP23]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i1> [[TMP15]], [[TMP17]] -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[TMP24]] to i16 -; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i1> [[TMP25]] to i16 -; CHECK-NEXT: store i16 [[TMP26]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[TMP27]] -; - %t1 = call i16 @llvm.x86.avx512.kor.w(i16 %a0, i16 8) - %t2 = call i16 @llvm.x86.avx512.kor.w(i16 %t1, i16 %a1) - ret i16 %t2 -} - -declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone -; TODO: the two kxnor instructions here are no-ops and should be eliminated, -; probably by FoldConstantArithmetic in SelectionDAG. 
-define i16 @test_kxnor(i16 %a0, i16 %a1) #0 {
-;
-; CHECK-LABEL: @test_kxnor(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], splat (i1 true)
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i1> [[_MSPROP]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i1> [[TMP5]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>)
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[_MSPROP1]] to i16
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP6]] to i16
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP7]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP8]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1>
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i1> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[TMP10]], splat (i1 true)
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i1> [[_MSPROP2]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i1> [[TMP13]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[_MSPROP3]] to i16
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i1> [[TMP14]] to i16
-; CHECK-NEXT: store i16 [[TMP15]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP16]]
-;
- %t1 = call i16 @llvm.x86.avx512.kxnor.w(i16 %a0, i16 8)
- %t2 = call i16 @llvm.x86.avx512.kxnor.w(i16 %t1, i16 %a1)
- ret i16 %t2
-}
-
-declare i16 @llvm.x86.avx512.kxor.w(i16, i16) nounwind readnone
-define i16 @test_kxor(i16 %a0, i16 %a1) #0 {
-;
-; CHECK-LABEL: @test_kxor(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>)
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[_MSPROP]] to i16
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[TMP5]] to i16
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP6]] to <16 x i1>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP7]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i1> [[TMP8]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i1> [[TMP9]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i1> [[_MSPROP1]] to i16
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP12]] to i16
-; CHECK-NEXT: store i16 [[TMP13]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP14]]
-;
- %t1 = call i16 @llvm.x86.avx512.kxor.w(i16 %a0, i16 8)
- %t2 = call i16 @llvm.x86.avx512.kxor.w(i16 %t1, i16 %a1)
- ret i16 %t2
-}
-
-declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone
-define i32 @test_kortestz(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D) #0 {
-; CHECK-LABEL: @test_kortestz(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[A:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[B:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP5]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP4]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], splat (i32 -1)
-; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP10]], [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP5]], [[TMP7]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[C:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i64> [[D:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i32> [[TMP15]], [[TMP17]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i32> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i32> [[TMP20]], splat (i32 -1)
-; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i32> [[TMP22]], [[TMP19]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <16 x i32> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> [[TMP21]], [[TMP24]]
-; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i1> [[TMP14]] to i16
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP1]] to i16
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i1> [[TMP25]] to i16
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast i16 [[TMP26]] to <16 x i1>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast i16 [[TMP27]] to <16 x i1>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[TMP28]] to <16 x i1>
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[TMP29]] to <16 x i1>
-; CHECK-NEXT: [[TMP34:%.*]] = xor <16 x i1> [[TMP31]], splat (i1 true)
-; CHECK-NEXT: [[TMP35:%.*]] = xor <16 x i1> [[TMP33]], splat (i1 true)
-; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[TMP30]], [[TMP32]]
-; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i1> [[TMP34]], [[TMP32]]
-; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP30]], [[TMP35]]
-; CHECK-NEXT: [[TMP39:%.*]] = or <16 x i1> [[TMP36]], [[TMP37]]
-; CHECK-NEXT: [[TMP40:%.*]] = or <16 x i1> [[TMP39]], [[TMP38]]
-; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i1> [[TMP31]], [[TMP33]]
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i1> [[TMP40]] to i16
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i1> [[TMP41]] to i16
-; CHECK-NEXT: [[TMP44:%.*]] = xor i16 [[TMP43]], 0
-; CHECK-NEXT: [[TMP45:%.*]] = or i16 [[TMP42]], 0
-; CHECK-NEXT: [[TMP46:%.*]] = icmp ne i16 [[TMP45]], 0
-; CHECK-NEXT: [[TMP47:%.*]] = xor i16 [[TMP45]], -1
-; CHECK-NEXT: [[TMP48:%.*]] = and i16 [[TMP47]], [[TMP44]]
-; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP48]], 0
-; CHECK-NEXT: [[_MSPROP_ICMP2:%.*]] = and i1 [[TMP46]], [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP43]], 0
-; CHECK-NEXT: [[_MSPROP:%.*]] = zext i1 [[_MSPROP_ICMP2]] to i32
-; CHECK-NEXT: [[TMP51:%.*]] = zext i1 [[TMP50]] to i32
-; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[TMP51]]
-;
-entry:
- %0 = bitcast <8 x i64> %A to <16 x i32>
- %1 = bitcast <8 x i64> %B to <16 x i32>
- %2 = icmp ne <16 x i32> %0, %1
- %3 = bitcast <8 x i64> %C to <16 x i32>
- %4 = bitcast <8 x i64> %D to <16 x i32>
- %5 = icmp ne <16 x i32> %3, %4
- %6 = bitcast <16 x i1> %2 to i16
- %7 = bitcast <16 x i1> %5 to i16
- %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone
-define i32 @test_kortestc(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D) #0 {
-; CHECK-LABEL: @test_kortestc(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[A:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[B:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP5]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP4]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], splat (i32 -1)
-; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP10]], [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP5]], [[TMP7]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[C:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i64> [[D:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i32> [[TMP15]], [[TMP17]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i32> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i32> [[TMP20]], splat (i32 -1)
-; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i32> [[TMP22]], [[TMP19]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <16 x i32> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> [[TMP21]], [[TMP24]]
-; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i1> [[TMP14]] to i16
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP1]] to i16
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i1> [[TMP25]] to i16
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast i16 [[TMP26]] to <16 x i1>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast i16 [[TMP27]] to <16 x i1>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[TMP28]] to <16 x i1>
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[TMP29]] to <16 x i1>
-; CHECK-NEXT: [[TMP34:%.*]] = xor <16 x i1> [[TMP31]], splat (i1 true)
-; CHECK-NEXT: [[TMP35:%.*]] = xor <16 x i1> [[TMP33]], splat (i1 true)
-; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[TMP30]], [[TMP32]]
-; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i1> [[TMP34]], [[TMP32]]
-; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP30]], [[TMP35]]
-; CHECK-NEXT: [[TMP39:%.*]] = or <16 x i1> [[TMP36]], [[TMP37]]
-; CHECK-NEXT: [[TMP40:%.*]] = or <16 x i1> [[TMP39]], [[TMP38]]
-; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i1> [[TMP31]], [[TMP33]]
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i1> [[TMP40]] to i16
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i1> [[TMP41]] to i16
-; CHECK-NEXT: [[TMP44:%.*]] = xor i16 [[TMP43]], 0
-; CHECK-NEXT: [[TMP45:%.*]] = or i16 [[TMP42]], 0
-; CHECK-NEXT: [[TMP46:%.*]] = icmp ne i16 [[TMP45]], 0
-; CHECK-NEXT: [[TMP47:%.*]] = xor i16 [[TMP45]], -1
-; CHECK-NEXT: [[TMP48:%.*]] = and i16 [[TMP47]], [[TMP44]]
-; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP48]], 0
-; CHECK-NEXT: [[_MSPROP_ICMP2:%.*]] = and i1 [[TMP46]], [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP43]], 0
-; CHECK-NEXT: [[_MSPROP:%.*]] = zext i1 [[_MSPROP_ICMP2]] to i32
-; CHECK-NEXT: [[TMP51:%.*]] = zext i1 [[TMP50]] to i32
-; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[TMP51]]
-;
-entry:
- %0 = bitcast <8 x i64> %A to <16 x i32>
- %1 = bitcast <8 x i64> %B to <16 x i32>
- %2 = icmp ne <16 x i32> %0, %1
- %3 = bitcast <8 x i64> %C to <16 x i32>
- %4 = bitcast <8 x i64> %D to <16 x i32>
- %5 = icmp ne <16 x i32> %3, %4
- %6 = bitcast <16 x i1> %2 to i16
- %7 = bitcast <16 x i1> %5 to i16
- %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7)
- ret i32 %res
-}
-
-define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) #0 {
-; CHECK-LABEL: @test_cmpps(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 2, <16 x i1> splat (i1 true), i32 8)
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[RES]] to i16
-; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP7]]
-;
- %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
- ret i16 %res
-}
-declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)
-
-define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) #0 {
-; CHECK-LABEL: @test_cmppd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 4, <8 x i1> splat (i1 true), i32 4)
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i1> [[RES]] to i8
-; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i8 [[TMP7]]
-;
- %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
- ret i8 %res
-}
-declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
-
-define <8 x i64> @test_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_mul_epi32_rr(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i64> [[TMP3]], splat (i64 32)
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32)
-; CHECK-NEXT: [[TMP10:%.*]] = ashr <8 x i64> [[TMP8]], splat (i64 32)
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32)
-; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32)
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32)
-; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32)
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP19]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rrk(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32)
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32)
-; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = ashr <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32)
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32)
-; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32)
-; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32)
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32)
-; CHECK-NEXT: [[TMP11:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32)
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32)
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32)
-; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32)
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP25]], <8 x i64> [[TMP22]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32)
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32)
-; CHECK-NEXT: [[TMP15:%.*]] = ashr <8 x i64> [[TMP13]], splat (i64 32)
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32)
-; CHECK-NEXT: [[TMP18:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP21:%.*]] = ashr <8 x i64> [[TMP19]], splat (i64 32)
-; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP24]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rmk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32)
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32)
-; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32)
-; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32)
-; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32)
-; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rmkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32)
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32)
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32)
-; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32)
-; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32)
-; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[TMP28]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP30]], <8 x i64> [[TMP27]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rmb(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32>
-; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32)
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32)
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32)
-; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32)
-; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32)
-; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP25]]
-;
- %q = load i64, ptr %ptr_b
- %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
- %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
- %b = bitcast <8 x i64> %b64 to <16 x i32>
- %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rmbk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32>
-; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32)
-; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32)
-; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32)
-; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32)
-; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = shl <8 x i64> [[TMP14]], splat (i64 32)
-; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32)
-; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = ashr <8 x i64> [[TMP23]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %q = load i64, ptr %ptr_b
- %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
- %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
- %b = bitcast <8 x i64> %b64 to <16 x i32>
- %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rmbkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32>
-; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32)
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32)
-; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32)
-; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32)
-; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32)
-; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer
-; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %q = load i64, ptr %ptr_b
- %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
- %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
- %b = bitcast <8 x i64> %b64 to <16 x i32>
- %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>)
-
-define <8 x i64> @test_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_mul_epu32_rr(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP3]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP19]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epu32_rrk(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epu32_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP25]], <8 x i64> [[TMP22]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mul_epu32_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP24]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epu32_rmk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]]
-; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epu32_rmkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]]
-; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[TMP28]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP30]], <8 x i64> [[TMP27]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mul_epu32_rmb(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32>
-; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP19:%.*]]
= and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP25]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mul_epu32_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], 
zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP21]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[TMP23]] -; CHECK-NEXT: [[TMP26:%.*]] = and <8 x i64> [[TMP14]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] -; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru - ret < 8 x i64> %res -} - -define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mul_epu32_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast 
<16 x i32> [[TMP9]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] -; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer - ret < 8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>) - -define <2 x double> @test_x86_avx512_mm_cvtu32_sd(<2 x double> %a, i32 %b) -; -; CHECK-LABEL: @test_x86_avx512_mm_cvtu32_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[B:%.*]] to double -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A:%.*]], double [[TMP4]], i64 0 -; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[TMP5]] -; - #0 { - %res = call <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double> %a, i32 %b) ; <<<2 x double>> [#uses=1] - ret <2 x double> %res -} -declare <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double>, i32) nounwind readnone - -define <16 x float> @test_x86_vbroadcast_ss_512(ptr %a0) #0 { -; -; CHECK-LABEL: 
@test_x86_vbroadcast_ss_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 3: -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[A0:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A0]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x float> poison, float [[TMP4]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <16 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <16 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <16 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[TMP4]], i32 3 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <16 x i32> [[_MSPROP3]], i32 [[_MSLD]], i32 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x float> [[TMP11]], float [[TMP4]], i32 4 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <16 x i32> [[_MSPROP4]], i32 [[_MSLD]], i32 5 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x float> [[TMP12]], float [[TMP4]], i32 5 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <16 x i32> [[_MSPROP5]], i32 [[_MSLD]], i32 6 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x float> [[TMP13]], float [[TMP4]], i32 6 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <16 x i32> [[_MSPROP6]], i32 [[_MSLD]], i32 7 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x float> [[TMP14]], float [[TMP4]], i32 7 -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <16 x i32> [[_MSPROP7]], i32 [[_MSLD]], i32 8 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> [[TMP15]], float [[TMP4]], i32 8 -; CHECK-NEXT: [[_MSPROP9:%.*]] = insertelement <16 x i32> [[_MSPROP8]], i32 [[_MSLD]], i32 9 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP4]], i32 9 -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <16 x i32> [[_MSPROP9]], i32 [[_MSLD]], i32 10 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP4]], i32 10 -; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <16 x i32> [[_MSPROP10]], i32 [[_MSLD]], i32 11 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP4]], i32 11 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <16 x i32> [[_MSPROP11]], i32 [[_MSLD]], i32 12 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 12 -; CHECK-NEXT: [[_MSPROP13:%.*]] = insertelement <16 x i32> [[_MSPROP12]], i32 [[_MSLD]], i32 13 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP4]], i32 13 -; CHECK-NEXT: [[_MSPROP14:%.*]] = insertelement <16 x i32> [[_MSPROP13]], i32 [[_MSLD]], i32 14 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP4]], i32 14 -; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <16 x 
i32> [[_MSPROP14]], i32 [[_MSLD]], i32 15 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP4]], i32 15 -; CHECK-NEXT: store <16 x i32> [[_MSPROP15]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP23]] -; - %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(ptr %a0) ; <<16 x float>> [#uses=1] - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(ptr) nounwind readonly - -define <8 x double> @test_x86_vbroadcast_sd_512(ptr %a0) #0 { -; -; CHECK-LABEL: @test_x86_vbroadcast_sd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 3: -; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[A0:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A0]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP4]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP4]], i32 1 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i64> [[_MSPROP1]], i64 [[_MSLD]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i64> [[_MSPROP2]], i64 [[_MSLD]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP4]], i32 3 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i64> [[_MSPROP3]], i64 [[_MSLD]], i32 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i64> [[_MSPROP4]], i64 [[_MSLD]], i32 5 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP4]], i32 5 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> [[_MSPROP5]], i64 [[_MSLD]], i32 6 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP4]], i32 6 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i64> [[_MSPROP6]], i64 [[_MSLD]], i32 7 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP4]], i32 7 -; CHECK-NEXT: store <8 x i64> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP15]] -; - %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(ptr %a0) ; <<8 x double>> [#uses=1] - ret <8 x double> %res -} -declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(ptr) nounwind readonly - -declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; 
CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP7]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP18]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_df_512( -; 
CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP16]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) - ret <8 x double> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: 
[[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) - ret <8 x i64> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 
[[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x 
i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) - ret <16 x float> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) - -define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pternlog_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call 
void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP9]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X0]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP1]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X0]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP17]] -; - %res = call <16 x i32> 
@llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) - -define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP17]] -; - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) - -define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pternlog_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP9]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[X0]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP1]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[X0]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP17]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) - 
-define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP17]] -; - %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) - ret <8 x i64> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: 
unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP4]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X4:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP10]]
-;
- %x2 = load <16 x i32>, ptr %x2p
- %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X1]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X1]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %x2 = load <16 x i32>, ptr %x2p
- %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP9]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
-; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]]
-; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP20]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
- ret <8 x double> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP9]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
- ret <16 x float> %res
-}
-
-define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]]
-; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP20]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
- ret <16 x float> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP4]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]])
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %x2 = load <16 x i32>, ptr %x2p
- %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[X2S:%.*]] = load double, ptr [[X2PTR:%.*]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2PTR]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> undef, double [[X2S]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]])
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64>
-; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP23]]
-;
- %x2s = load double, ptr %x2ptr
- %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
- %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
- %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
- ret <8 x double> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]])
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
- ret <16 x float> %res
-}
-
-
-declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP4]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
-declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
-declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
-
-define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vsubps_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vsubps_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 9)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vsubps_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 10)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vsubps_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 11)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vmulps_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vmulps_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 9)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vmulps_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 10)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vmulps_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 11)
- ret <16 x float> %res
-}
-
-;; mask float
-define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 %mask, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 %mask, i32 9)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 %mask, i32 10)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 %mask, i32 11)
- ret <16 x float> %res
-}
-
-;; With Passthru value
-define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_passthru_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> %passthru, i16 %mask, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_passthru_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> %passthru, i16 %mask, i32 9)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_passthru_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> %passthru, i16 %mask, i32 10)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_passthru_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> %passthru, i16 %mask, i32 11)
- ret <16 x float> %res
-}
-
-;; mask double
-define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulpd_mask_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
- <8 x double> zeroinitializer, i8 %mask, i32 8)
- ret <8 x double> %res
-}
-
-define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulpd_mask_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
- <8 x double> zeroinitializer, i8 %mask, i32 9)
- ret <8 x double> %res
-}
-
-define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulpd_mask_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
- <8 x double> zeroinitializer, i8 %mask, i32 10)
- ret <8 x double> %res
-}
-
-define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulpd_mask_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
- <8 x double> zeroinitializer, i8 %mask, i32 11)
- ret <8 x double> %res
-}
-
-define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %res = call <16 x
float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) - ret <16 x float> %res -} -define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 9) - ret <16 x float> %res -} -define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_maskz_add_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> 
@llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 10) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 11) - ret <16 x float> %res -} - - -define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: 
@test_mm512_maskz_add_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_add_round_ps_rn_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 
[[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) - ret <16 x float> %res -} -define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_add_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 9) - ret <16 x float> %res -} -define <16 x float> 
@test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_add_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 10) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_add_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], 
!prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 11) - ret <16 x float> %res -} - - -define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_add_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] 
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x float> %res -} - - -define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_add_round_ps_rn_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res -} -define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_add_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 9) - ret <16 x float> %res -} -define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_add_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: 
[[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 10) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_add_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 11) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_add_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res -} -declare <16 x float> 
@llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) - -define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rn_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) - ret <16 x float> %res -} -define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = 
or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 9) - ret <16 x float> %res -} -define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_sub_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; 
CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 10) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 11) - ret <16 x float> %res -} - - -define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_sub_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; 
CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_sub_round_ps_rn_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res -} -define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_sub_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() 
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 9) - ret <16 x float> %res -} -define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_sub_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 10) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_sub_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> 
%a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 11)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_sub_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
-  ret <16 x float> %res
-}
-define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 9)
-  ret <16 x float> %res
-}
-define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_div_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 10)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 11)
-  ret <16 x float> %res
-}
-
-
-define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_div_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_div_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
-  ret <16 x float> %res
-}
-define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_div_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 9)
-  ret <16 x float> %res
-}
-define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_div_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 10)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_div_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 11)
-  ret <16 x float> %res
-}
-
-
-define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_div_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
-  ret <16 x float> %res
-}
-
-
-define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_div_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
-  ret <16 x float> %res
-}
-define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_div_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 9)
-  ret <16 x float> %res
-}
-define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_div_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 10)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_div_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 11)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_div_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
-
-define void @test_mask_compress_store_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_compress_store_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP3]], ptr [[TMP8]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8f64(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 %mask)
-  ret void
-}
-
-declare void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 %mask)
-
-define void @test_compress_store_pd_512(ptr %addr, <8 x double> %data) #0 {
-;
-; CHECK-LABEL: @test_compress_store_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP2]], ptr [[TMP5]], <8 x i1> splat (i1 true))
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8f64(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true))
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 -1)
-  ret void
-}
-
-define void @test_mask_compress_store_ps_512(ptr %addr, <16 x float> %data, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_compress_store_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP3]], ptr [[TMP8]], <16 x i1> [[TMP5]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16f32(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]])
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 %mask)
-  ret void
-}
-
-declare void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 %mask)
-
-define void @test_compress_store_ps_512(ptr %addr, <16 x float> %data) #0 {
-;
-; CHECK-LABEL: @test_compress_store_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP2]], ptr [[TMP5]], <16 x i1> splat (i1 true))
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16f32(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true))
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 -1)
-  ret void
-}
-
-define void @test_mask_compress_store_q_512(ptr %addr, <8 x i64> %data, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_compress_store_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP3]], ptr [[TMP8]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 %mask)
-  ret void
-}
-
-declare void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 %mask)
-
-define void @test_compress_store_q_512(ptr %addr, <8 x i64> %data) #0 {
-;
-; CHECK-LABEL: @test_compress_store_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP2]], ptr [[TMP5]], <8 x i1> splat (i1 true))
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true))
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 -1)
-  ret void
-}
-
-define void @test_mask_compress_store_d_512(ptr %addr, <16 x i32> %data, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_compress_store_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP3]], ptr [[TMP8]], <16 x i1> [[TMP5]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]])
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 %mask)
-  ret void
-}
-
-declare void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 %mask)
-
-define void @test_compress_store_d_512(ptr %addr, <16 x i32> %data) #0 {
-;
-; CHECK-LABEL: @test_compress_store_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP2]], ptr [[TMP5]], <16 x i1> splat (i1 true))
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true))
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 -1)
-  ret void
-}
-
-define <8 x double> @test_mask_expand_load_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_expand_load_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP8]], <8 x i1> [[TMP5]], <8 x i64> [[TMP3]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x double> [[DATA:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP12]]
-;
-  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 %mask)
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_maskz_expand_load_pd_512(ptr %addr, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_maskz_expand_load_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP7]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x double> zeroinitializer)
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP11]]
-;
-  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> zeroinitializer, i8 %mask)
-  ret <8 x double> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 %mask)
-
-define <8 x double> @test_expand_load_pd_512(ptr %addr, <8 x double> %data) #0 {
-;
-; CHECK-LABEL: @test_expand_load_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> splat (i1 true), <8 x i64> [[TMP2]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x double> [[DATA:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP8]]
-;
-  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 -1)
-  ret <8 x double> %res
-}
-
-; Make sure we don't crash if you pass 0 to the mask.
-define <8 x double> @test_zero_mask_expand_load_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 {
-; CHECK-LABEL: @test_zero_mask_expand_load_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> zeroinitializer, <8 x i64> [[TMP2]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> zeroinitializer, <8 x double> [[DATA:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP8]]
-;
-  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 0)
-  ret <8 x double> %res
-}
-
-define <16 x float> @test_mask_expand_load_ps_512(ptr %addr, <16 x float> %data, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_expand_load_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP8]], <16 x i1> [[TMP5]], <16 x i32> [[TMP3]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[TMP12:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x float> [[DATA:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP12]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> %data, i16 %mask)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_maskz_expand_load_ps_512(ptr %addr, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_maskz_expand_load_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP7]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x float> zeroinitializer)
-; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP11]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> zeroinitializer, i16 %mask)
-  ret <16 x float> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> %data, i16 %mask)
-
-define <16 x float> @test_expand_load_ps_512(ptr %addr, <16 x float> %data) #0 {
-;
-; CHECK-LABEL: @test_expand_load_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP5]], <16 x i1> splat (i1 true), <16 x i32> [[TMP2]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x float> [[DATA:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP8]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> %data, i16 -1)
-  ret <16 x float> %res
-}
-
-define <8 x i64> @test_mask_expand_load_q_512(ptr %addr, <8 x i64> %data, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_expand_load_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP8]], <8 x i1> [[TMP5]], <8 x i64> [[TMP3]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x i64> [[DATA:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
-  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %data, i8 %mask)
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_maskz_expand_load_q_512(ptr %addr, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_maskz_expand_load_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP7]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
-  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> zeroinitializer, i8 %mask)
-  ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %data, i8 %mask)
-
-define <8 x i64> @test_expand_load_q_512(ptr %addr, <8 x i64> %data) #0 {
-;
-; CHECK-LABEL: @test_expand_load_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> splat (i1 true), <8 x i64> [[TMP2]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x i64> [[DATA:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP8]]
-;
-  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %data, i8 -1)
-  ret <8 x i64> %res
-}
-
-define <16 x i32> @test_mask_expand_load_d_512(ptr %addr, <16 x i32> %data, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_expand_load_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP8]], <16 x i1> [[TMP5]], <16 x i32> [[TMP3]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x i32> [[DATA:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
-  %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> %data, i16 %mask)
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_maskz_expand_load_d_512(ptr %addr, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_maskz_expand_load_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP7]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
-  %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> zeroinitializer, i16 %mask)
-  ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> %data, i16 %mask)
-
-define <16 x i32> @test_expand_load_d_512(ptr %addr, <16 x i32> %data) #0 {
-;
-; CHECK-LABEL: @test_expand_load_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP5]], <16 x i1> splat (i1 true), <16 x i32> [[TMP2]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x i32> [[DATA:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP8]]
-;
-  %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> %data, i16 -1)
-  ret <16 x i32> %res
-}
-
-define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_min_round_ps_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_min_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_min_round_ps_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_min_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-;
CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_min_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_min_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x 
float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) - -define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_maskz_max_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_maskz_max_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call 
void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_max_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret 
<16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_max_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_max_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; 
CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_max_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) - -define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_sqrt_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) -; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> undef, i8 -1, i32 4) - ret <8 x double> %res -} -define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sqrt_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], 
[[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> %passthru, i8 %mask, i32 4) - ret <8 x double> %res -} -define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_sqrt_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 4) - ret <8 x double> %res -} -define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_sqrt_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP5]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> undef, i8 -1, i32 11) - ret <8 x double> %res -} -define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sqrt_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> zeroinitializer, <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP9]], <8 x double> [[TMP7]], <8 x double> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP16]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> %passthru, i8 %mask, i32 11) - ret <8 x double> %res -} -define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_sqrt_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP14]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 11) - ret <8 x double> %res -} -declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone - -define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) #0 { -; 
CHECK-LABEL: @test_sqrt_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) -; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> undef, i16 -1, i32 4) - ret <16 x float> %res -} -define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sqrt_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP13]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> %passthru, i16 %mask, i32 4) - ret <16 x float> %res -} -define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_sqrt_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %res = call 
<16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %res -} -define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_sqrt_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP5]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 11) - ret <16 x float> %res -} -define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sqrt_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> %passthru, i16 %mask, i32 11) - ret <16 x float> %res -} -define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_sqrt_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP14]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 11) - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone - -declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_prolv_d_512_old( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP7]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_d_512_old( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP16]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_maskz_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_d_512_old( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP15]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { 
-; CHECK-LABEL: @test_int_x86_avx512_prolv_q_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]])
-; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP7]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_q_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP16]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_q_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP15]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_prorv_d_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]])
-; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP7]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_d_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_maskz_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_d_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP15]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_prorv_q_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]])
-; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP7]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_q_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP16]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_q_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP15]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i32, <16 x i32>, i16)
-
-define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_prol_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3))
-; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3))
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]]
-; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4))
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4))
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = or
<16 x i32> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i32, <8 x i64>, i8) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_prol_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; 
CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -declare <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32>, i32, <16 x i32>, i16) - -define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_pror_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pror_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: 
[[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } 
[[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64>, i32, <8 x i64>, i8) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_pror_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pror_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> 
splat (i64 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) - -define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast double 
[[TMP5]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP5]] -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X0]], double [[TMP18]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] -; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] -; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] -; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 24: -; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11) -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0 -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] -; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] -; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 31: -; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10) -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP11]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP27]] to i64 -; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0 -; CHECK-NEXT: 
[[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP11]] -; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP27]] -; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT15]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X0]], double [[TMP42]], i64 0 -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]] -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]] -; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res2, %res3 - ret <2 x double> %res4 -} - -declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) - -define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP5]] to i32 -; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 
[[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP5]] -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] -; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] -; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] -; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 24: -; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11) -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0 -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] -; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] -; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 31: -; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10) -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP11]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP27]] to i32 -; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0 -; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP11]] -; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP27]] -; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 
x i32> [[TMP1]], i32 [[_MSPROP_SELECT15]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X0]], float [[TMP42]], i64 0 -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]] -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]] -; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES4]] -; - %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 11) - %res2 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 -} - -declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) - -define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP14]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP16]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP11]], double [[TMP8]], double 0.000000e+00 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[X0]], double [[TMP17]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x 
i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP14]] -; CHECK-NEXT: [[_MSCMP15:%.*]] = icmp ne i64 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR16:%.*]] = or i1 [[_MSOR]], [[_MSCMP15]] -; CHECK-NEXT: br i1 [[_MSOR16]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]] -; CHECK: 22: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 23: -; CHECK-NEXT: [[TMP24:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP19]], double [[TMP20]], double [[TMP21]], i32 11) -; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP25]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i1> [[TMP26]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 0, i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast double [[TMP24]] to i64 -; CHECK-NEXT: [[TMP30:%.*]] = xor i64 [[TMP29]], 0 -; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[TMP30]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP31]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT11:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP32]], i64 [[TMP28]] -; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP27]], double [[TMP24]], double 0.000000e+00 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT11]], i64 0 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x double> [[X0]], double [[TMP33]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP12]] -; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[TMP18]], [[TMP34]] -; CHECK-NEXT: store <2 x i64> [[_MSPROP13]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES2]] -; - %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 11) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 -} - -declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) - -define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: 
[[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = xor i32 [[TMP13]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = or i32 [[TMP14]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP16]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP11]], float [[TMP8]], float 0.000000e+00 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[X0]], float [[TMP17]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i32 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP14]] -; CHECK-NEXT: [[_MSCMP15:%.*]] = icmp ne i32 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR16:%.*]] = or i1 [[_MSOR]], [[_MSCMP15]] -; CHECK-NEXT: br i1 [[_MSOR16]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]] -; CHECK: 22: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 23: -; CHECK-NEXT: [[TMP24:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]], i32 11) -; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP25]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i1> [[TMP26]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 0 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast float [[TMP24]] to i32 -; CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP29]], 0 -; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = or i32 [[TMP31]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT11:%.*]] = select i1 [[_MSPROP10]], i32 [[TMP32]], i32 [[TMP28]] -; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP27]], float [[TMP24]], float 0.000000e+00 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT11]], i64 0 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[X0]], float [[TMP33]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP12]] 
-; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[TMP18]], [[TMP34]]
-; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[TMP18]]
-;
- %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 11)
- %res2 = fadd <4 x float> %res, %res1
- ret <4 x float> %res
-}
-declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
-
-define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP2]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP7]] to i64
-; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]]
-; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP2]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP7]]
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X2]], double [[TMP18]], i64 0
-; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0
-; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
-; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0
-; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
-; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
-; CHECK: 23:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 24:
-; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11)
-; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X2]], double [[TMP25]], i64 0
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0
-; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0
-; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
-; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
-; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
-; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
-; CHECK: 30:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 31:
-; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10)
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
-; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP13]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP29]] to i64
-; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]]
-; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0
-; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP13]]
-; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]]
-; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP29]]
-; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT15]], i64 0
-; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X2]], double [[TMP42]], i64 0
-; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]]
-; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]]
-; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES4]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
- %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11)
- %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res2, %res3
- ret <2 x double> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
-
-define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP2]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP7]] to i32
-; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]]
-; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP2]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP7]]
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X2]], float [[TMP18]], i64 0
-; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0
-; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
-; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0
-; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
-; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
-; CHECK: 23:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 24:
-; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11)
-; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP3]], i32 0, i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X2]], float [[TMP25]], i64 0
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0
-; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0
-; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
-; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
-; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
-; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
-; CHECK: 30:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 31:
-; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10)
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
-; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP13]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP29]] to i32
-; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]]
-; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0
-; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP13]]
-; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]]
-; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP29]]
-; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT15]], i64 0
-; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X2]], float [[TMP42]], i64 0
-; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]]
-; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]]
-; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES4]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 11)
- %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
-define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) #0 {
-;
-; CHECK-LABEL: @fmadd_ss_mask_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
-; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
-; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
-; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0
-; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1
-; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2
-; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3
-; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]]
-; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]]
-; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 [[_MSPROP9]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP14]] to i32
-; CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], [[_MSPROP13]]
-; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP25]], [[_MSPROP9]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP26]], i32 [[TMP21]]
-; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], float [[TMP17]], float [[TMP14]]
-; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[AV]], float [[TMP27]], i64 0
-; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0
-; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP28]], i32 0
-; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
-; CHECK: 29:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 30:
-; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080
-; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
-; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP33]], align 4
-; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4
-; CHECK-NEXT: ret void
-;
- %a.val = load float, ptr %a
- %av0 = insertelement <4 x float> undef, float %a.val, i32 0
- %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
- %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
- %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
-
- %b.val = load float, ptr %b
- %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
- %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
- %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
- %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
-
- %vr = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4)
-
- %sr = extractelement <4 x float> %vr, i32 0
- store float %sr, ptr %a
- ret void
-}
-
-define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 {
-;
-; CHECK-LABEL: @fmadd_ss_maskz_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A_VAL:%.*]] = load
float, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
-; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
-; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
-; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0
-; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1
-; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2
-; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3
-; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]]
-; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]]
-; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32
-; CHECK-NEXT: [[TMP23:%.*]] = xor i32 [[TMP22]], 0
-; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP13]]
-; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], 0
-;
CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP25]], i32 [[TMP21]]
-; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], float [[TMP17]], float 0.000000e+00
-; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[AV]], float [[TMP26]], i64 0
-; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0
-; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP27]], i32 0
-; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
-; CHECK: 28:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 29:
-; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080
-; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP32]], align 4
-; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4
-; CHECK-NEXT: ret void
-;
- %a.val = load float, ptr %a
- %av0 = insertelement <4 x float> undef, float %a.val, i32 0
- %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
- %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
- %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
-
- %b.val = load float, ptr %b
- %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
- %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
- %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
- %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
-
- %vr = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4)
-
- %sr = extractelement <4 x float> %vr, i32 0
- store float %sr, ptr %a
- ret void
-}
-
-define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) #0 {
-;
-; CHECK-LABEL: @fmadd_sd_mask_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
-; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn()
#[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0
-; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1
-; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0
-; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]]
-; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]]
-; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 [[_MSPROP5]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast double [[TMP14]] to i64
-; CHECK-NEXT: [[TMP24:%.*]] = xor i64 [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], [[_MSPROP9]]
-; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP25]], [[_MSPROP5]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP26]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], double [[TMP17]], double [[TMP14]]
-; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> [[AV]], double [[TMP27]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0
-; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP28]], i32 0
-; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
-; CHECK: 29:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 30:
-; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080
-; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
-; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP33]], align 8
-; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8
-; CHECK-NEXT: ret void
-;
- %a.val = load double, ptr %a
- %av0 = insertelement <2 x double> undef, double %a.val, i32 0
- %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
-
- %b.val = load double, ptr %b
- %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
- %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
-
- %vr = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4)
-
- %sr = extractelement <2 x double> %vr, i32 0
- store double %sr, ptr %a
- ret void
-}
-
-define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 {
-;
-; CHECK-LABEL: @fmadd_sd_maskz_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
-; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0
-; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1
-; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0
-; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]]
-; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]]
-; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
-;
CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64
-; CHECK-NEXT: [[TMP23:%.*]] = xor i64 [[TMP22]], 0
-; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP9]]
-; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], 0
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP25]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], double [[TMP17]], double 0.000000e+00
-; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[AV]], double [[TMP26]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0
-; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP27]], i32 0
-; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
-; CHECK: 28:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 29:
-; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080
-; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP32]], align 8
-; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8
-; CHECK-NEXT: ret void
-;
- %a.val = load double, ptr %a
- %av0 = insertelement <2 x double> undef, double %a.val, i32 0
- %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
-
- %b.val = load double, ptr %b
- %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
- %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
-
- %vr = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4)
-
- %sr = extractelement <2 x double> %vr, i32 0
- store double %sr, ptr %a
- ret void
-}
-
-declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
-
-define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X2:%.*]]
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP6]], double [[TMP7]], double
[[TMP8]])
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[_MSPROP4]], i64 [[_MSPROP5]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double [[TMP9]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64
-; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP17]], [[_MSPROP4]]
-; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP5]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP19]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], double [[TMP9]], double [[TMP10]]
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> [[X2]], double [[TMP20]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = fneg <2 x double> [[X2]]
-; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP22]], i64 0
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0
-; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]]
-; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0
-; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]]
-; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]]
-; CHECK: 26:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 27:
-; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP23]], double [[TMP24]], double [[TMP25]], i32 11)
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> [[X2]], double [[TMP28]], i64 0
-; CHECK-NEXT: [[TMP31:%.*]] = fneg <2 x double> [[X2]]
-; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP31]], i64 0
-; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
-; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0
-; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]]
-; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0
-; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]]
-; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]],
label [[TMP36:%.*]], !prof [[PROF1]]
-; CHECK: 35:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 36:
-; CHECK-NEXT: [[TMP37:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP32]], double [[TMP33]], double [[TMP34]], i32 10)
-; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0
-; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i64 0, i64 [[_MSPROP16]]
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast double [[TMP37]] to i64
-; CHECK-NEXT: [[TMP44:%.*]] = bitcast double [[TMP38]] to i64
-; CHECK-NEXT: [[TMP45:%.*]] = xor i64 [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP45]], 0
-; CHECK-NEXT: [[TMP47:%.*]] = or i64 [[TMP46]], [[_MSPROP16]]
-; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP47]], i64 [[TMP42]]
-; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], double [[TMP37]], double [[TMP38]]
-; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT18]], i64 0
-; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x double> [[X2]], double [[TMP48]], i64 0
-; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP21]], [[TMP30]]
-; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP49]], [[RES3]]
-; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES4]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
- %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11)
- %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res2, %res3
- ret <2 x double> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
-
-define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X2:%.*]]
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
-;
CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float [[TMP7]], float [[TMP8]])
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[_MSPROP4]], i32 [[_MSPROP5]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float [[TMP9]] to i32
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32
-; CHECK-NEXT: [[TMP17:%.*]] = xor i32 [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP17]], [[_MSPROP4]]
-; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP5]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP19]], i32 [[TMP14]]
-; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], float [[TMP9]], float [[TMP10]]
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[X2]], float [[TMP20]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = fneg <4 x float> [[X2]]
-; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP22]], i64 0
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0
-; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]]
-; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0
-; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]]
-; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]]
-; CHECK: 26:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 27:
-; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 11)
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[X2]], float [[TMP28]], i64 0
-; CHECK-NEXT: [[TMP31:%.*]] = fneg <4 x float> [[X2]]
-; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP31]], i64 0
-; CHECK-NEXT:
[[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
-; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0
-; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]]
-; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0
-; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]]
-; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]]
-; CHECK: 35:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 36:
-; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP32]], float [[TMP33]], float [[TMP34]], i32 10)
-; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0
-; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 0, i32 [[_MSPROP16]]
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast float [[TMP37]] to i32
-; CHECK-NEXT: [[TMP44:%.*]] = bitcast float [[TMP38]] to i32
-; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP45]], 0
-; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP46]], [[_MSPROP16]]
-; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP47]], i32 [[TMP42]]
-; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], float [[TMP37]], float [[TMP38]]
-; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT18]], i64 0
-; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[X2]], float [[TMP48]], i64 0
-; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP21]], [[TMP30]]
-; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP49]], [[RES3]]
-; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES4]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 11)
- %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
-declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
-
-define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT:
[[TMP5:%.*]] = fneg <2 x double> [[X0:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = fneg <2 x double> [[X2:%.*]]
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP10:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]])
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[_MSPROP4]], i64 [[_MSPROP5]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast double [[TMP11]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP4]]
-; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP5]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP20]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], double [[TMP10]], double [[TMP11]]
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[X2]], double [[TMP21]], i64 0
-; CHECK-NEXT: [[TMP23:%.*]] = fneg <2 x double> [[X0]]
-; CHECK-NEXT: [[TMP24:%.*]] = fneg <2 x double> [[X2]]
-; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP23]], i64 0
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x double> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[TMP24]], i64 0
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0
-; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]]
-; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0
-; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]]
-; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
-; CHECK: 28:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 29:
-; CHECK-NEXT: [[TMP30:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]], i32 11)
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP2]], i64 0, i64 0
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[X2]], double [[TMP30]], i64 0
-; CHECK-NEXT: [[TMP33:%.*]] = fneg
<2 x double> [[X0]]
-; CHECK-NEXT: [[TMP34:%.*]] = fneg <2 x double> [[X2]]
-; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i64 0
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x double> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x double> [[TMP34]], i64 0
-; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
-; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0
-; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]]
-; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0
-; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]]
-; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]]
-; CHECK: 38:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 39:
-; CHECK-NEXT: [[TMP40:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP35]], double [[TMP36]], double [[TMP37]], i32 10)
-; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0
-; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0
-; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i64 0, i64 [[_MSPROP16]]
-; CHECK-NEXT: [[TMP46:%.*]] = bitcast double [[TMP40]] to i64
-; CHECK-NEXT: [[TMP47:%.*]] = bitcast double [[TMP41]] to i64
-; CHECK-NEXT: [[TMP48:%.*]] = xor i64 [[TMP46]], [[TMP47]]
-; CHECK-NEXT: [[TMP49:%.*]] = or i64 [[TMP48]], 0
-; CHECK-NEXT: [[TMP50:%.*]] = or i64 [[TMP49]], [[_MSPROP16]]
-; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP50]], i64 [[TMP45]]
-; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], double [[TMP40]], double [[TMP41]]
-; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT18]], i64 0
-; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x double> [[X2]], double [[TMP51]], i64 0
-; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP22]], [[TMP32]]
-; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP52]], [[RES3]]
-; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES4]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
- %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11)
- %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res2, %res3
- ret <2 x double> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
-
-define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
-;
-; CHECK-LABEL:
@test_int_x86_avx512_mask3_vfnmsub_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X0:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = fneg <4 x float> [[X2:%.*]]
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]])
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[_MSPROP4]], i32 [[_MSPROP5]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float [[TMP11]] to i32
-; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP4]]
-; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP5]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP20]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], float [[TMP10]], float [[TMP11]]
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[X2]], float [[TMP21]], i64 0
-; CHECK-NEXT: [[TMP23:%.*]] = fneg <4 x float> [[X0]]
-; CHECK-NEXT: [[TMP24:%.*]] = fneg <4 x float> [[X2]]
-; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP23]], i64 0
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP24]], i64 0
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0
-; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]]
-; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0
-; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]]
-; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
-; CHECK: 28:
-; CHECK-NEXT: call void
@__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 29:
-; CHECK-NEXT: [[TMP30:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]], i32 11)
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i64 0
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[X2]], float [[TMP30]], i64 0
-; CHECK-NEXT: [[TMP33:%.*]] = fneg <4 x float> [[X0]]
-; CHECK-NEXT: [[TMP34:%.*]] = fneg <4 x float> [[X2]]
-; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i64 0
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP34]], i64 0
-; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
-; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0
-; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]]
-; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0
-; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]]
-; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]]
-; CHECK: 38:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 39:
-; CHECK-NEXT: [[TMP40:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP35]], float [[TMP36]], float [[TMP37]], i32 10)
-; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0
-; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0
-; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 0, i32 [[_MSPROP16]]
-; CHECK-NEXT: [[TMP46:%.*]] = bitcast float [[TMP40]] to i32
-; CHECK-NEXT: [[TMP47:%.*]] = bitcast float [[TMP41]] to i32
-; CHECK-NEXT: [[TMP48:%.*]] = xor i32 [[TMP46]], [[TMP47]]
-; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], 0
-; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[_MSPROP16]]
-; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP50]], i32 [[TMP45]]
-; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], float [[TMP40]], float [[TMP41]]
-; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT18]], i64 0
-; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[X2]], float [[TMP51]], i64 0
-; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP22]], [[TMP32]]
-; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP52]], [[RES3]]
-; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES4]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float>
%x2, i8 -1, i32 11)
- %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
-define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr%ptr_b ,i8 %x3,i32 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]]
-; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]]
-; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]])
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0
-; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP3]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP12]] to i32
-; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]]
-; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]]
-; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP12]]
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X1]], float [[TMP23]], i64 0
-; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x
float> [[TMP24]]
-;
- %q = load float, ptr %ptr_b
- %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
- %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
- ret < 4 x float> %res
-}
-
-define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]]
-; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]]
-; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]])
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0
-; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP1]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP10]] to i32
-; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]]
-; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]]
-; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP10]]
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X0]], float [[TMP23]], i64 0
-; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8
-;
CHECK-NEXT: ret <4 x float> [[TMP24]]
-;
- %q = load float, ptr %ptr_b
- %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
- %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0,<4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
- ret < 4 x float> %res
-}
-
-
-define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]]
-; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.fma.f32(float [[TMP9]], float [[TMP10]], float [[TMP11]])
-; CHECK-NEXT: [[TMP13:%.*]] = select i1 false, i32 [[_MSPROP5]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP12]] to i32
-; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP14]], 0
-; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP5]]
-; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], 0
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP17]], i32 [[TMP13]]
-; CHECK-NEXT: [[TMP18:%.*]] = select i1 false, float [[TMP12]], float 0.000000e+00
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0
-; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[TMP19]]
-;
- %q = load float, ptr %ptr_b
- %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
- %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 0, i32 4)
- ret < 4 x float> %res
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 {
-; CHECK-LABEL:
@test_int_x86_avx512_pmov_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> -; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[TMP2]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) - ret <8 x i32> %res -} - -define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP10]], <8 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP4]], <8 x i32> [[X1]] -; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[TMP11]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) - ret <8 x i32> %res -} - -define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP3]], <8 x i32> zeroinitializer -; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[TMP10]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) - ret <8 x i32> %res -} - -declare <16 x float> 
@llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32) - -define <16 x float> @test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 15: -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer -; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 8) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 -} - -declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32) - -define <16 x float> @test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> -; 
CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 15: -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer -; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 8) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 -} - -define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_compress_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %passthru, i8 %mask) - ret <8 x double> %res -} - -define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_compress_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), 
align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP9]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) - ret <8 x double> %res -} - -define <8 x double> @test_compress_pd_512(<8 x double> %data) #0 { -; CHECK-LABEL: @test_compress_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true)) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> undef, i8 -1) - ret <8 x double> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask) - -define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_compress_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x 
i1> [[TMP5]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> %passthru, i16 %mask) - ret <16 x float> %res -} - -define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_compress_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP9]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> zeroinitializer, i16 %mask) - ret <16 x float> %res -} - -define <16 x float> @test_compress_ps_512(<16 x float> %data) #0 { -; CHECK-LABEL: @test_compress_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true)) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> undef, i16 -1) - ret <16 x float> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> %src0, i16 %mask) - -define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_compress_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], 
[[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_compress_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP9]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_compress_q_512(<8 x i64> %data) #0 { -; CHECK-LABEL: @test_compress_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> undef, i8 -1) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> %src0, i8 %mask) - -define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_compress_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_compress_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP9]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_compress_d_512(<16 x i32> %data) #0 { -; CHECK-LABEL: @test_compress_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> undef, i16 -1) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask) - -define <8 x double> 
@test_expand_pd_512(<8 x double> %data) #0 { -; CHECK-LABEL: @test_expand_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true)) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> undef, i8 -1) - ret <8 x double> %res -} - -define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_expand_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %passthru, i8 %mask) - ret <8 x double> %res -} - -define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_expand_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> 
@llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP9]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) - ret <8 x double> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask) - -define <16 x float> @test_expand_ps_512(<16 x float> %data) #0 { -; CHECK-LABEL: @test_expand_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true)) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> undef, i16 -1) - ret <16 x float> %res -} - -define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_expand_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> %passthru, i16 %mask) - ret <16 x float> %res -} - -define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_expand_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP9]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> zeroinitializer, i16 %mask) - ret <16 x float> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> %src0, i16 %mask) - -define <8 x i64> @test_expand_q_512(<8 x i64> %data) #0 { -; CHECK-LABEL: @test_expand_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> undef, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_expand_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 { -; -; 
CHECK-LABEL: @test_maskz_expand_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP9]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> %src0, i8 %mask) - -define <16 x i32> @test_expand_d_512(<16 x i32> %data) #0 { -; CHECK-LABEL: @test_expand_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> undef, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_expand_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> 
@llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_expand_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP9]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask) - -define <16 x float> @test_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, ptr %p) #0 { -; -; CHECK-LABEL: @test_cmp_512( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 1, <16 x i1> splat (i1 true), i32 8) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne 
i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[C:%.*]], <16 x float> [[D:%.*]], i32 1, <16 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP5]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] -; CHECK: 15: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 16: -; CHECK-NEXT: [[TMP17:%.*]] = load <16 x float>, ptr [[P:%.*]], align 64 -; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[P]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], 87960930222080 -; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP20]], align 64 -; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i1> [[TMP9]], [[TMP14]] -; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> zeroinitializer, <16 x i32> [[_MSLD]] -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x float> [[TMP17]] to <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i32> zeroinitializer, [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP25]], [[_MSLD]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP26]], <16 x i32> [[TMP22]] -; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP21]], <16 x float> zeroinitializer, <16 x float> [[TMP17]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP27]] -; - entry: - %0 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, i32 8) - %1 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %c, <16 x float> %d, i32 1, i32 4) - %2 = load <16 x float>, ptr %p - %3 = xor <16 x i1> %0, %1 - %4 = select <16 x i1> %3, <16 x float> zeroinitializer, <16 x float> %2 - ret <16 x float> %4 -} - -declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32) - -attributes #0 = { sanitize_memory } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll deleted file mode 100644 index 052b497831ee1..0000000000000 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll +++ /dev/null @@ -1,13714 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt %s -S -mtriple=x86_64-linux-gnu -mattr=+avx512f -passes=msan 2>&1 | FileCheck %s -; -; Forked from llvm/test/CodeGen/X86/avx512-intrinsics.ll - -define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_mask_compress_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; 
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1:![0-9]+]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10:[0-9]+]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP11]]
-;
- %1 = bitcast i8 %mask to <8 x i1>
- %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1)
- ret <8 x double> %2
-}
-
-define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_compress_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP9]]
-;
- %1 = bitcast i8 %mask to <8 x i1>
- %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1)
- ret <8 x double> %2
-}
-
-define <8 x double> @test_compress_pd_512(<8 x double> %data) #0 {
-; CHECK-LABEL: @test_compress_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true))
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP2]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> splat (i1 true))
- ret <8 x double> %1
-}
-
-define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_mask_compress_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP11]]
-;
- %1 = bitcast i16 %mask to <16 x i1>
- %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1)
- ret <16 x float> %2
-}
-
-define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) #0 {
-; CHECK-LABEL: @test_maskz_compress_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP9]]
-;
- %1 = bitcast i16 %mask to <16 x i1>
- %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1)
- ret <16 x float> %2
-}
-
-define <16 x float> @test_compress_ps_512(<16 x float> %data) #0 {
-; CHECK-LABEL: @test_compress_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true))
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP2]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> splat (i1 true))
- ret <16 x float> %1
-}
-
-define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_compress_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %1 = bitcast i8 %mask to <8 x i1>
- %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1)
- ret <8 x i64> %2
-}
-
-define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_compress_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP9]]
-;
- %1 = bitcast i8 %mask to <8 x i1>
- %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1)
- ret <8 x i64> %2
-}
-
-define <8 x i64> @test_compress_q_512(<8 x i64> %data) #0 {
-; CHECK-LABEL: @test_compress_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true))
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP2]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> splat (i1 true))
- ret <8 x i64> %1
-}
-
-define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_mask_compress_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %1 = bitcast i16 %mask to <16 x i1>
- %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1)
- ret <16 x i32> %2
-}
-
-define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 {
-; CHECK-LABEL: @test_maskz_compress_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP9]]
-;
- %1 = bitcast i16 %mask to <16 x i1>
- %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1)
- ret <16 x i32> %2
-}
-
-define <16 x i32> @test_compress_d_512(<16 x i32> %data) #0 {
-; CHECK-LABEL: @test_compress_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true))
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP2]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> splat (i1 true))
- ret <16 x i32> %1
-}
-
-define <8 x double> @test_expand_pd_512(<8 x double> %data) #0 {
-; CHECK-LABEL: @test_expand_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true))
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP2]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> splat (i1 true))
- ret <8 x double> %1
-}
-
-define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_expand_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP11]]
-;
- %1 = bitcast i8 %mask to <8 x i1>
- %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1)
- ret <8 x double> %2
-}
-
-define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_expand_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP9]]
-;
- %1 = bitcast i8 %mask to <8 x i1>
- %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1)
- ret <8 x double> %2
-}
-
-define <16 x float> @test_expand_ps_512(<16 x float> %data) #0 {
-; CHECK-LABEL: @test_expand_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true))
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP2]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> splat (i1 true))
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_mask_expand_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast
<16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %1 = bitcast i16 %mask to <16 x i1> - %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1) - ret <16 x float> %2 -} - -define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0 { -; CHECK-LABEL: @test_maskz_expand_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP9]] -; - %1 = bitcast i16 %mask to <16 x i1> - %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1) - ret <16 x float> %2 -} - -define <8 x i64> @test_expand_q_512(<8 x i64> %data) #0 { -; CHECK-LABEL: @test_expand_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %1 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> ) - ret <8 x i64> %1 -} - -define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_mask_expand_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = 
bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %1 = bitcast i8 %mask to <8 x i1> - %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1) - ret <8 x i64> %2 -} - -define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 { -; CHECK-LABEL: @test_maskz_expand_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP9]] -; - %1 = bitcast i8 %mask to <8 x i1> - %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1) - ret <8 x i64> %2 -} - -define <16 x i32> @test_expand_d_512(<16 x i32> %data) #0 { -; CHECK-LABEL: @test_expand_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %1 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> ) - ret <16 x i32> %1 -} - -define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_mask_expand_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %1 = bitcast i16 %mask to <16 x i1> - %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1) - ret <16 x i32> %2 -} - -define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) #0 { -; CHECK-LABEL: @test_maskz_expand_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP9]] -; - %1 = bitcast i16 %mask to <16 x i1> - %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1) - ret <16 x i32> %2 -} - -define <16 x float> @test_rcp_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_rcp_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: 
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1] - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone - -define <8 x double> @test_rcp_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_rcp_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1] - ret <8 x double> %res -} -declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone - -declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32) - -define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b) #0 { -; CHECK-LABEL: @test_rndscale_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 11, i32 4) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 11, i32 4) - ret <2 x double>%res -} - -define <2 x double> @test_rndscale_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) #0 { -; CHECK-LABEL: @test_rndscale_sd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: 
[[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4) - ret <2 x double>%res -} - -define <2 x double> @test_rndscale_sd_mask_load(<2 x double> %a, ptr %bptr, <2 x double> %c, i8 %mask) #0 { -; CHECK-LABEL: @test_rndscale_sd_mask_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[B:%.*]] = load <2 x double>, ptr [[BPTR:%.*]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[BPTR]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: 
[[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %b = load <2 x double>, ptr %bptr - %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4) - ret <2 x double>%res -} - -define <2 x double> @test_rndscale_sd_maskz(<2 x double> %a, <2 x double> %b, i8 %mask) #0 { -; CHECK-LABEL: @test_rndscale_sd_maskz( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 11, i32 4) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> zeroinitializer, i8 %mask, i32 11, i32 4) - ret <2 x double>%res -} - -declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32) - -define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b) #0 { -; CHECK-LABEL: @test_rndscale_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 11, i32 4) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4) - ret <4 x float>%res -} - -define <4 x float> @test_rndscale_ss_load(<4 x float> %a, ptr %bptr) #0 { -; CHECK-LABEL: @test_rndscale_ss_load( -; 
CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load <4 x float>, ptr [[BPTR:%.*]], align 16 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[BPTR]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B]], <4 x float> undef, i8 -1, i32 11, i32 4) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %b = load <4 x float>, ptr %bptr - %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4) - ret <4 x float>%res -} - -define <4 x float> @test_rndscale_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) #0 { -; CHECK-LABEL: @test_rndscale_ss_mask( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> 
%a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 11, i32 4) - ret <4 x float>%res -} - -define <4 x float> @test_rndscale_ss_maskz(<4 x float> %a, <4 x float> %b, i8 %mask) #0 { -; CHECK-LABEL: @test_rndscale_ss_maskz( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 11, i32 4) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask, i32 11, i32 4) - ret <4 x float>%res -} - -declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32) - -define <8 x double> @test7(<8 x double> %a) #0 { -; CHECK-LABEL: @test7( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> [[A:%.*]], i32 11, <8 x double> [[A]], i8 -1, i32 4) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4) - ret <8 x double>%res -} - -declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32) - -define <16 x float> @test8(<16 x float> %a) #0 { -; CHECK-LABEL: @test8( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], 
[[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> [[A:%.*]], i32 11, <16 x float> [[A]], i16 -1, i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4) - ret <16 x float>%res -} - -define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_rsqrt_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1] - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone - -define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_sqrt_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) -; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0) - ret <8 x double> %1 -} - -define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_mask_sqrt_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] -; 
CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) #0 { -; CHECK-LABEL: @test_maskz_sqrt_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer - ret <8 x double> %3 -} -declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) - -define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_sqrt_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP5]] -; - %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11) - ret <8 x double> %1 -} - -define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_mask_sqrt_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; 
CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> zeroinitializer, <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP9]], <8 x double> [[TMP7]], <8 x double> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP16]] -; - %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) #0 { -; CHECK-LABEL: @test_maskz_sqrt_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP14]] -; - %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer - ret <8 x double> %3 -} -declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32) nounwind readnone - -define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_sqrt_ps_512( -; 
CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) -; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0) - ret <16 x float> %1 -} - -define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_mask_sqrt_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP13]] -; - %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) #0 { -; CHECK-LABEL: @test_maskz_sqrt_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0) - 
%2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} -declare <16 x float> @llvm.sqrt.v16f32(<16 x float>) - -define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_sqrt_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP5]] -; - %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11) - ret <16 x float> %1 -} - -define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_mask_sqrt_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) #0 { -; CHECK-LABEL: @test_maskz_sqrt_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP14]] -; - %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} -declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32) nounwind readnone - -define <8 x double> @test_getexp_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_getexp_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) - ret <8 x double> %res -} -define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_getexp_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1, i32 12) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %res = call <8 x double> 
@llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 12) - ret <8 x double> %res -} -declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone - -define <16 x float> @test_getexp_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_getexp_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_getexp_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone - -declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone - -define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { -; CHECK-LABEL: @test_sqrt_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; 
CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES0:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]] -; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]], i8 [[MASK]], i32 9) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSCMP13]], [[_MSCMP14]] -; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] -; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> zeroinitializer, i8 [[MASK]], i32 10) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0 -; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]] -; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] -; CHECK: 21: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 22: -; CHECK-NEXT: [[RES3:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> zeroinitializer, i8 -1, i32 11) -; CHECK-NEXT: [[RES_1:%.*]] = fadd <4 x float> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES_2:%.*]] = fadd <4 x float> [[RES2]], [[RES3]] -; CHECK-NEXT: [[RES:%.*]] = fadd <4 x float> [[RES_1]], [[RES_2]] -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) - %res1 = call <4 x float> 
@llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9) - %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 10) - %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 11) - - %res.1 = fadd <4 x float> %res0, %res1 - %res.2 = fadd <4 x float> %res2, %res3 - %res = fadd <4 x float> %res.1, %res.2 - ret <4 x float> %res -} - -declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone - -define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 { -; CHECK-LABEL: @test_sqrt_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES0:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]] -; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]], i8 [[MASK]], i32 9) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 -; 
CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSCMP13]], [[_MSCMP14]] -; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] -; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> zeroinitializer, i8 [[MASK]], i32 10) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0 -; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]] -; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] -; CHECK: 21: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 22: -; CHECK-NEXT: [[RES3:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> zeroinitializer, i8 -1, i32 11) -; CHECK-NEXT: [[RES_1:%.*]] = fadd <2 x double> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES_2:%.*]] = fadd <2 x double> [[RES2]], [[RES3]] -; CHECK-NEXT: [[RES:%.*]] = fadd <2 x double> [[RES_1]], [[RES_2]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9) - %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 10) - %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 11) - - %res.1 = fadd <2 x double> %res0, %res1 - %res.2 = fadd <2 x double> %res2, %res3 - %res = fadd <2 x double> %res.1, %res.2 - ret <2 x double> %res -} - -define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvttsd2usi( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[A0:%.*]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[A0]], i32 8) -; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES2]] -; - %res0 = call i32 
@llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ; - %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ; - %res2 = add i32 %res0, %res1 - ret i32 %res2 -} -declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvttsd2si( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[A0:%.*]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[A0]], i32 8) -; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES2]] -; - %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ; - %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ; - %res2 = add i32 %res0, %res1 - ret i32 %res2 -} -declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvttss2si( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A0:%.*]], i32 8) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A0]], i32 4) -; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES2]] -; - %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ; - %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ; - %res2 = add i32 %res0, %res1 - ret i32 %res2 -} -declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvttss2si_load(ptr %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvttss2si_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br 
i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 3: -; CHECK-NEXT: [[A1:%.*]] = load <4 x float>, ptr [[A0:%.*]], align 16 -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A1]], i32 4) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES]] -; - %a1 = load <4 x float>, ptr %a0 - %res = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a1, i32 4) ; - ret i32 %res -} - -define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvttss2usi( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[A0:%.*]], i32 8) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[A0]], i32 4) -; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES2]] -; - %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ; - %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ; - %res2 = add i32 %res0, %res1 - ret i32 %res2 -} -declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvtsd2usi32( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0:%.*]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 
6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0]], i32 9) -; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES4]] -; - %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4) - %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 11) - %res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 9) - %res3 = add i32 %res, %res1 - %res4 = add i32 %res3, %res2 - ret i32 %res4 -} -declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvtsd2si32( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0:%.*]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0]], i32 9) -; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES4]] -; - %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4) - %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 11) - %res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 9) - %res3 = add i32 %res, %res1 - %res4 = add i32 %res3, %res2 - ret i32 %res4 -} -declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvtss2usi32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0:%.*]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i32 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0]], i32 9) -; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES4]] -; - %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4) - %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 11) - %res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 9) - %res3 = add i32 %res, %res1 - %res4 = add i32 %res3, %res2 - ret i32 %res4 -} -declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvtss2si32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0:%.*]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0]], i32 9) -; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] -; CHECK-NEXT: store i32 0, ptr 
@__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES4]] -; - %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4) - %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 11) - %res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 9) - %res3 = add i32 %res, %res1 - %res4 = add i32 %res3, %res2 - ret i32 %res4 -} -declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone - -define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, ptr %dst) #0 { -; CHECK-LABEL: @test_x86_vcvtps2ph_256( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0:%.*]], i32 2, <16 x i16> zeroinitializer, i16 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 11, <16 x i16> zeroinitializer, i16 [[MASK:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i16> [[TMP3]] to i256 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i16 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES3:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 12, <16 x i16> [[SRC:%.*]], i16 [[MASK]]) -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP8]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] -; CHECK: 15: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 16: -; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[DST:%.*]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], 87960930222080 -; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr -; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr [[TMP19]], align 32 -; CHECK-NEXT: store <16 x i16> 
[[RES1]], ptr [[DST]], align 32 -; CHECK-NEXT: [[RES:%.*]] = add <16 x i16> [[RES2]], [[RES3]] -; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i16> [[RES]] -; - %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask) - %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 12, <16 x i16> %src, i16 %mask) - store <16 x i16> %res1, ptr %dst - %res = add <16 x i16> %res2, %res3 - ret <16 x i16> %res -} - -declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly - -define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) #0 { -; CHECK-LABEL: @test_cmpps( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 2, <16 x i1> splat (i1 true), i32 8) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[RES]] to i16 -; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[TMP7]] -; - %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8) - %1 = bitcast <16 x i1> %res to i16 - ret i16 %1 -} -declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) - -define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) #0 { -; CHECK-LABEL: @test_cmppd( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 4, <8 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i1> [[RES]] to i8 -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[TMP7]] -; - %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) - 
%1 = bitcast <8 x i1> %res to i8 - ret i8 %1 -} -declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32) - - - ; fp min - max -define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) #0 { -; CHECK-LABEL: @test_vmaxpd( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP7]] -; - %1 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) - ret <8 x double> %1 -} -declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32) - -define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) #0 { -; CHECK-LABEL: @test_vminpd( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP7]] -; - %1 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) - ret <8 x double> %1 -} -declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) - -define void @test_mask_store_ss(ptr %ptr, <4 x float> %data, i8 %mask) #0 { -; CHECK-LABEL: @test_mask_store_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP1]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]] -; 
CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[MASK]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP8]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP9]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080 -; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[TMP14]], i32 1, <4 x i1> [[EXTRACT]]) -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] -; CHECK: 16: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 17: -; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[DATA:%.*]], ptr [[PTR]], i32 1, <4 x i1> [[EXTRACT]]) -; CHECK-NEXT: ret void -; - %1 = and i8 %mask, 1 - %2 = bitcast i8 %1 to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> - call void @llvm.masked.store.v4f32.p0(<4 x float> %data, ptr %ptr, i32 1, <4 x i1> %extract) - ret void -} -declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) #1 - - -declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32) -declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32) -declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32) - -define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vsubps_rn( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - ret <16 x float> %1 -} - -define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vsubps_rd( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = 
bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) - ret <16 x float> %1 -} - -define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vsubps_ru( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) - ret <16 x float> %1 -} - -define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vsubps_rz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) - ret <16 x float> %1 -} - -define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vmulps_rn( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x 
i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - ret <16 x float> %1 -} - -define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vmulps_rd( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) - ret <16 x float> %1 -} - -define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vmulps_ru( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> 
@llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) - ret <16 x float> %1 -} - -define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vmulps_rz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) - ret <16 x float> %1 -} - -define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_vmulps_mask_rn( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - %2 = bitcast i16 %mask to 
<16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_vmulps_mask_rd( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_vmulps_mask_ru( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x 
float> [[A1:%.*]], i32 10) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_vmulps_mask_rz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> 
%passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_vmulps_mask_passthru_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
- ret <16 x float> %3
-}
-
-define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_vmulps_mask_passthru_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
- ret <16 x float> %3
-}
-
-define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_vmulps_mask_passthru_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
- ret <16 x float> %3
-}
-
-define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_vmulps_mask_passthru_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
- ret <16 x float> %3
-}
-
-define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_vmulpd_mask_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 8)
- %2 = bitcast i8 %mask to <8 x i1>
- %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
- ret <8 x double> %3
-}
-
-define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_vmulpd_mask_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 9)
- %2 = bitcast i8 %mask to <8 x i1>
- %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
- ret <8 x double> %3
-}
-
-define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_vmulpd_mask_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 10)
- %2 = bitcast i8 %mask to <8 x i1>
- %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
- ret <8 x double> %3
-}
-
-define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_vmulpd_mask_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 11)
- %2 = bitcast i8 %mask to <8 x i1>
- %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
- ret <8 x double> %3
-}
-
-define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_maskz_add_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_maskz_add_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_add_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_add_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_add_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_add_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_add_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_add_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_add_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_add_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_add_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_add_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- ret <16 x float> %1
-}
-declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)
-
-define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_sub_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_sub_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_sub_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_sub_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_sub_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_sub_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_sub_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x
i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_div_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x 
i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> 
[[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_div_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_div_round_ps_rn_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_div_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x 
float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_div_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_div_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load 
<16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_div_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; 
CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_div_round_ps_rn_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - ret <16 x float> %1 -} - -define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_div_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: 
[[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) - ret <16 x float> %1 -} - -define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_div_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) - ret <16 x float> %1 -} - -define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_div_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) - ret <16 x float> %1 -} - -define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_div_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: 
[[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) - ret <16 x float> %1 -} -declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32) - -define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_min_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_min_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; 
CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_min_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; 
CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_min_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { 
-; CHECK-LABEL: @test_mm512_min_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - ret <16 x float> %1 -} - -define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_min_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) - ret <16 x float> %1 -} -declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) - -define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_max_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; 
CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_max_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> 
%a1, i32 4)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_max_round_ps_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_max_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_max_round_ps_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_max_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- ret <16 x float> %1
-}
-declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)
-
-declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
-define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_ss_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_ss_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 9)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_ss_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 10)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 10)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_ss_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 11)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 11)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_ss_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_add_ss_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) #0 {
-; CHECK-LABEL: @test_add_ss_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_ss_current_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
-; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
-; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %a1.val = load float, ptr %a1
- %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
- %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
- %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
- %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, ptr %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_add_ss_current_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
-; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
-; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %a1.val = load float, ptr %a1
- %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
- %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
- %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
- %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-
-define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_sd_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_sd_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 9)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_sd_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 10)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 10)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_sd_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 11)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 11)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_sd_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_add_sd_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_add_sd_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_sd_current_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %a1.val = load double, ptr %a1
- %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
- %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, ptr %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_add_sd_current_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %a1.val = load double, ptr %a1
- %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
- %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
- ret <2 x double> %res
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
-define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_max_ss_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_max_ss_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) #0 {
-; CHECK-LABEL: @test_max_ss_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_max_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_max_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) #0 {
-; CHECK-LABEL: @test_max_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_max_ss_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
-; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
-; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %a1.val = load float, ptr %a1
- %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
- %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
- %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
- %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, ptr %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_max_ss_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
-; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
-; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %a1.val = load float, ptr %a1
- %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
- %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
- %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
- %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-
-define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_max_sd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_max_sd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_max_sd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_max_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_max_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_max_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_max_sd_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
-; CHECK-NEXT:
[[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]] -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %a1.val = load double, ptr %a1 - %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0 - %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1 - %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4) - ret <2 x double> %res -} - -define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, ptr %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_maskz_max_sd_memfold( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1 -; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 12: -; CHECK-NEXT: 
[[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %a1.val = load double, ptr %a1 - %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0 - %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1 - %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4) - ret <2 x double> %res -} - -define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) #0 { -; CHECK-LABEL: @test_x86_avx512_cvtsi2ss32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 11) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 11) ; <<<4 x float>> [#uses=1] - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone - -define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b) #0 { -; CHECK-LABEL: @test_x86_avx512__mm_cvt_roundu32_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 9) -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<<4 x float>> [#uses=1] - ret <4 x float> %res -} - -define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, ptr %ptr) #0 { -; CHECK-LABEL: @test_x86_avx512__mm_cvt_roundu32_ss_mem( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label 
[[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[PTR:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSLD]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B]], i32 9) -; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %b = load i32, ptr %ptr - %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<<4 x float>> [#uses=1] - ret <4 x float> %res -} - -define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b) #0 { -; CHECK-LABEL: @test_x86_avx512__mm_cvtu32_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 4) -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1] - ret <4 x float> %res -} - -define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, ptr %ptr) #0 { -; CHECK-LABEL: @test_x86_avx512__mm_cvtu32_ss_mem( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[PTR:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSLD]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; 
CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B]], i32 4) -; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %b = load i32, ptr %ptr - %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1] - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone - -declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>) - -define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2P]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]]) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP9]] -; - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) - ret <16 x i32> %1 -} - -define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: 
[[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X1]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP17]] -; - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1 - ret <16 x i32> %3 -} - -declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>) - -define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP9]] -; - %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) - ret <8 x double> %1 -} - -define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64> -; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]] -; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP20]] -; - %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) - %2 = bitcast <8 x i64> %x1 to <8 x double> - %3 = bitcast i8 %x3 to <8 x i1> - %4 = select <8 x i1> %3, <8 x double> %1, <8 x double> %2 - ret <8 x double> %4 -} - -declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>) - -define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 -; 
CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP9]] -; - %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) - ret <16 x float> %1 -} - -define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]] -; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP20]] -; - %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) - %2 = bitcast <16 x i32> %x1 to <16 x float> - %3 = bitcast i16 %x3 to <16 x i1> - %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2 - ret <16 x float> 
%4 -} - -declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>) - -define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]]) -; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP4]] -; - %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) - ret <8 x i64> %1 -} - -define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1 - ret <8 x i64> %3 -} - -define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP17]] -; - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer - ret <16 x i32> %3 -} - -define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[X2S:%.*]] = load double, ptr [[X2PTR:%.*]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2PTR]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x 
i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> undef, double [[X2S]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]]) -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64> -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP23]] -; - %x2s = load double, ptr %x2ptr - %x2ins = insertelement <8 x double> undef, double %x2s, i32 0 - %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer - %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer - ret <8 x double> %3 -} - -define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 
[[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x 
i64> %x0, <8 x i64> %x2) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer - ret <8 x i64> %3 -} - -define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]]) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP4]] -; - %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) - ret <16 x i32> %1 -} - -define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1 - ret <16 x i32> %3 -} - -declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) -define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_512( -; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 11) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x double> [[X2]], i8 -1, i32 8) -; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]] -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 11) - %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 8) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 -} - -declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) -define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to 
ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x float> [[X2:%.*]], i16 [[X3:%.*]], i32 10) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x float> [[X2]], i16 -1, i32 8) -; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[RES]], [[RES1]] -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 10) - %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 8) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 -} - -declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8) - -define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qb_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 
[[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i8> [[RES4]] -; - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 -} - -declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmov_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qb_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; 
CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8) - -define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qb_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <16 x 
i8> [[RES3]], [[RES2]] -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i8> [[RES4]] -; - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 -} - -declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qb_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8) - -define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qb_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; 
CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i8> [[RES4]] -; - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 -} - -declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qb_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: 
[[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8) - -define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qw_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], 
[[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i16> [[RES4]] -; - %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) - %res3 = add <8 x i16> %res0, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 -} - -declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmov_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qw_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8) - -define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qw_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = 
or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i16> [[RES4]] -; - %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) - %res3 = add <8 x i16> %res0, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 -} - -declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qw_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; 
CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8) - -define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qw_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <8 
x i16> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i16> [[RES4]] -; - %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) - %res3 = add <8 x i16> %res0, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 -} - -declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qw_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmov_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> -; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[TMP2]] -; - %1 = trunc <8 x i64> %x0 to <8 x i32> - ret <8 x i32> %1 -} - -define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add 
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP10]], <8 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP4]], <8 x i32> [[X1]] -; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[TMP11]] -; - %1 = trunc <8 x i64> %x0 to <8 x i32> - %2 = bitcast i8 %x2 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 - ret <8 x i32> %3 -} - -define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP3]], <8 x i32> zeroinitializer -; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[TMP10]] -; - %1 = trunc <8 x i64> %x0 to <8 x i32> - %2 = bitcast i8 %x2 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer - ret <8 x i32> %3 -} - -declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmov_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; 
CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmovs_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 -1) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[RES]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) - ret <8 x i32> %res -} - -define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] 
= or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[RES]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) - ret <8 x i32> %res -} - -define <8 x i32>@test_int_x86_avx512_maskz_pmovs_qd_512(<8 x i64> %x0, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovs_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> zeroinitializer, i8 [[X2:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[RES]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) - ret <8 x i32> %res -} - -declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qd_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] 
-; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmovus_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 -1) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[RES]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) - ret <8 x i32> %res -} - -define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[RES]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) - ret <8 x i32> %res -} - -define <8 x i32>@test_int_x86_avx512_maskz_pmovus_qd_512(<8 x i64> %x0, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovus_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> zeroinitializer, i8 [[X2:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[RES]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) - ret <8 x i32> %res -} - -declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qd_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16) - -define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_db_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i8> [[RES4]] -; - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 -} - -declare void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32>, i16) - -define void @test_int_x86_avx512_mask_pmov_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_db_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: 
[[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1) - call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2) - ret void -} - -declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) - -define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_db_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0 -; 
CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
-; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i8> [[RES4]]
-;
- %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
- %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
- %res3 = add <16 x i8> %res0, %res1
- %res4 = add <16 x i8> %res3, %res2
- ret <16 x i8> %res4
-}
-
-declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32>, i16)
-
-define void @test_int_x86_avx512_mask_pmovs_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_db_mem_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
- call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
- ret void
-}
-
-declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_db_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
-; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i8> [[RES4]]
-;
- %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
- %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
- %res3 = add <16 x i8> %res0, %res1
- %res4 = add <16 x i8> %res3, %res2
- ret <16 x i8> %res4
-}
-
-declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32>, i16)
-
-define void @test_int_x86_avx512_mask_pmovus_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_db_mem_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
- call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
- ret void
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_dw_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
-; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i16> [[RES4]]
-;
- %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
- %res3 = add <16 x i16> %res0, %res1
- %res4 = add <16 x i16> %res3, %res2
- ret <16 x i16> %res4
-}
-
-declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32>, i16)
-
-define void @test_int_x86_avx512_mask_pmov_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_dw_mem_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
- call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
- ret void
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_dw_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
-; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i16> [[RES4]]
-;
- %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
- %res3 = add <16 x i16> %res0, %res1
- %res4 = add <16 x i16> %res3, %res2
- ret <16 x i16> %res4
-}
-
-declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32>, i16)
-
-define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_dw_mem_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
- call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
- ret void
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_dw_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
-; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i16> [[RES4]]
-;
- %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
- %res3 = add <16 x i16> %res0, %res1
- %res4 = add <16 x i16> %res3, %res2
- ret <16 x i16> %res4
-}
-
-declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32>, i16)
-
-define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_dw_mem_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
- call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
- ret void
-}
-
-declare <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32>, i32)
-
-define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
-; CHECK: 14:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 15:
-; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer
-; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES2]]
-;
- %cvt = sitofp <16 x i32> %x0 to <16 x float>
- %1 = bitcast i16 %x2 to <16 x i1>
- %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1
- %3 = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8)
- %res2 = fadd <16 x float> %2, %3
- ret <16 x float> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
-
-define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2dq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i32> [[RES2]]
-;
- %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
-
-define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> [[X0:%.*]], <8 x float> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> [[X0]], <8 x float> [[X1]], i8 -1, i32 10)
-; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x float> [[RES2]]
-;
- %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 10)
- %res2 = fadd <8 x float> %res, %res1
- ret <8 x float> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
-
-define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2udq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 10)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i32> [[RES2]]
-;
- %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 10)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32)
-
-define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2dq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 10)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[RES2]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32)
-
-define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> [[X0:%.*]], <8 x double> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> [[X0]], <8 x double> [[X1]], i8 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES2]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8)
- %res2 = fadd <8 x double> %res, %res1
- ret <8 x double> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
-
-define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2udq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 10)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[RES2]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
-
-define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_pd2dq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i32> [[RES2]]
-;
- %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32>, i32)
-
-define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
-; CHECK: 14:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 15:
-; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer
-; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES2]]
-;
- %cvt = uitofp <16 x i32> %x0 to <16 x float>
- %1 = bitcast i16 %x2 to <16 x i1>
- %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1
- %3 = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8)
- %res2 = fadd <16 x float> %2, %3
- ret <16 x float> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
-
-define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_pd2udq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i32> [[RES2]]
-;
- %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32)
-
-define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_ps2dq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[RES2]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32)
-
-define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_ps2udq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[RES2]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
-define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1) #0 {
-; CHECK-LABEL: @test_getexp_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_getexp_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES0:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]], i8 [[MASK]], i32 8)
-; CHECK-NEXT: [[RES_1:%.*]] = fadd <4 x float> [[RES0]], [[RES1]]
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES_1]]
-;
- %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
- %res.1 = fadd <4 x float> %res0, %res1
- ret <4 x float> %res.1
-}
-
-define <4 x float> @test_maskz_getexp_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_getexp_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
- ret <4 x float> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-
-define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_getexp_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_getexp_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES0:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]], i8 [[MASK]], i32 8)
-; CHECK-NEXT: [[RES_1:%.*]] = fadd <2 x double> [[RES0]], [[RES1]]
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES_1]]
-;
- %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
- %res.1 = fadd <2 x double> %res0, %res1
- ret <2 x double> %res.1
-}
-
-define <2 x double> @test_maskz_getexp_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_getexp_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
- ret <2 x double> %res
-}
-
-declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
-
-define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 5, i8 [[X3:%.*]], i32 8)
-; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i8 [[RES4]]
-;
- %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
- ret i8 %res4
-}
-
-define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_sd_all(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES1:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 2, i8 -1, i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
-;
CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 3, i8 -1, i32 8) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]] -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 15: -; CHECK-NEXT: [[RES3:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 4, i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP17]], 0 -; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSCMP10]], [[_MSCMP11]] -; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR14:%.*]] = or i1 [[_MSOR12]], [[_MSCMP13]] -; CHECK-NEXT: br i1 [[_MSOR14]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] -; CHECK: 18: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 19: -; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 5, i8 [[X3]], i32 8) -; CHECK-NEXT: [[TMP20:%.*]] = xor i8 [[RES1]], -1 -; CHECK-NEXT: [[TMP21:%.*]] = xor i8 [[RES2]], -1 -; CHECK-NEXT: [[TMP22:%.*]] = and i8 [[TMP20]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = and i8 0, [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = or i8 0, [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = or i8 [[TMP24]], [[TMP23]] -; CHECK-NEXT: [[RES11:%.*]] = or i8 [[RES1]], [[RES2]] -; CHECK-NEXT: [[TMP26:%.*]] = xor i8 [[RES3]], -1 -; CHECK-NEXT: [[TMP27:%.*]] = xor i8 [[RES4]], -1 -; CHECK-NEXT: [[TMP28:%.*]] = and i8 [[TMP26]], 0 -; CHECK-NEXT: [[TMP29:%.*]] = and i8 0, [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = or i8 0, [[TMP28]] -; CHECK-NEXT: [[TMP31:%.*]] = or i8 [[TMP30]], [[TMP29]] -; CHECK-NEXT: [[RES12:%.*]] = or i8 [[RES3]], [[RES4]] -; CHECK-NEXT: [[TMP32:%.*]] = xor i8 [[RES11]], -1 -; CHECK-NEXT: [[TMP33:%.*]] = xor i8 [[RES12]], -1 -; CHECK-NEXT: [[TMP34:%.*]] = and i8 [[TMP25]], [[TMP31]] -; CHECK-NEXT: [[TMP35:%.*]] = and i8 [[TMP32]], [[TMP31]] -; CHECK-NEXT: [[TMP36:%.*]] = and i8 [[TMP25]], [[TMP33]] -; CHECK-NEXT: [[TMP37:%.*]] = or i8 [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP38:%.*]] = or i8 [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[RES13:%.*]] = or i8 [[RES11]], [[RES12]] -; CHECK-NEXT: store i8 [[TMP38]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[RES13]] -; - %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4) - %res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8) - %res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, 
i8 %x3, i32 4) - %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8) - - %res11 = or i8 %res1, %res2 - %res12 = or i8 %res3, %res4 - %res13 = or i8 %res11, %res12 - ret i8 %res13 -} - -declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32) - -define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 3, i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[RES2]] -; - %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4) - ret i8 %res2 -} - - -define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_ss_all( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 2, i8 -1, i32 4) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; 
CHECK: 11: -; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 3, i8 -1, i32 8) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]] -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 15: -; CHECK-NEXT: [[RES3:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 4, i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP17]], 0 -; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSCMP10]], [[_MSCMP11]] -; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR14:%.*]] = or i1 [[_MSOR12]], [[_MSCMP13]] -; CHECK-NEXT: br i1 [[_MSOR14]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] -; CHECK: 18: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 19: -; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 5, i8 [[X3]], i32 8) -; CHECK-NEXT: [[TMP20:%.*]] = and i8 [[RES1]], 0 -; CHECK-NEXT: [[TMP21:%.*]] = and i8 0, [[RES2]] -; CHECK-NEXT: [[TMP22:%.*]] = or i8 0, [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or i8 [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[RES11:%.*]] = and i8 [[RES1]], [[RES2]] -; CHECK-NEXT: [[TMP24:%.*]] = and i8 [[RES3]], 0 -; CHECK-NEXT: [[TMP25:%.*]] = and i8 0, [[RES4]] -; CHECK-NEXT: [[TMP26:%.*]] = or i8 0, [[TMP24]] -; CHECK-NEXT: [[TMP27:%.*]] = or i8 [[TMP26]], [[TMP25]] -; CHECK-NEXT: [[RES12:%.*]] = and i8 [[RES3]], [[RES4]] -; CHECK-NEXT: [[TMP28:%.*]] = and i8 [[TMP23]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = and i8 [[RES11]], [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = and i8 [[TMP23]], [[RES12]] -; CHECK-NEXT: [[TMP31:%.*]] = or i8 [[TMP28]], [[TMP29]] -; CHECK-NEXT: [[TMP32:%.*]] = or i8 [[TMP31]], [[TMP30]] -; CHECK-NEXT: [[RES13:%.*]] = and i8 [[RES11]], [[RES12]] -; CHECK-NEXT: store i8 [[TMP32]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[RES13]] -; - %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4) - %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8) - %res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4) - %res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8) - - %res11 = and i8 %res1, %res2 - %res12 = and i8 %res3, %res4 - %res13 = and i8 %res11, %res12 - ret i8 %res13 -} - -declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32) - -define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, 
align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> [[X0:%.*]], i32 11, <8 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> [[X0]], i32 11, <8 x double> [[X2]], i8 -1, i32 8) -; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]] -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 -} - -declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32) - -define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; 
CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> [[X0:%.*]], i32 11, <16 x float> [[X2:%.*]], i16 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> [[X0]], i32 11, <16 x float> [[X2]], i16 -1, i32 8) -; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[RES]], [[RES1]] -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 -} - -declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32) - -define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 11, <2 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp 
ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 12, <2 x double> zeroinitializer, i8 [[X3]], i32 4) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] -; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 13, <2 x double> [[X2]], i8 [[X3]], i32 8) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0 -; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP21:%.*]] = icmp ne i128 [[TMP21]], 0 -; CHECK-NEXT: [[_MSOR22:%.*]] = or i1 [[_MSOR20]], [[_MSCMP21]] -; CHECK-NEXT: br i1 [[_MSOR22]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]] -; CHECK: 22: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 23: -; CHECK-NEXT: [[RES3:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 14, <2 x double> [[X2]], i8 -1, i32 4) -; CHECK-NEXT: [[RES11:%.*]] = fadd <2 x double> [[RES]], [[RES1]] -; CHECK-NEXT: [[RES12:%.*]] = fadd <2 x double> [[RES2]], [[RES3]] -; CHECK-NEXT: [[RES13:%.*]] = fadd <2 x double> [[RES11]], [[RES12]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES13]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 12, <2 x double> zeroinitializer, i8 %x3, i32 4) - %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 13, <2 x double> %x2, i8 %x3, i32 8) - %res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 14, <2 x double> %x2, i8 -1, i32 4) - %res11 = fadd <2 x double> %res, %res1 - %res12 = fadd <2 x double> %res2, %res3 - %res13 = fadd <2 x double> %res11, %res12 - ret <2 x double> %res13 -} - -declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32) - -define <4 x 
float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 11, <4 x float> [[X2:%.*]], i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 12, <4 x float> zeroinitializer, i8 [[X3]], i32 4) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 13, <4 x float> [[X2]], i8 -1, i32 8) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i128 [[TMP19]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 
x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i128 [[TMP20]], 0 -; CHECK-NEXT: [[_MSOR18:%.*]] = or i1 [[_MSCMP16]], [[_MSCMP17]] -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP21]], 0 -; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSOR18]], [[_MSCMP19]] -; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]] -; CHECK: 22: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 23: -; CHECK-NEXT: [[RES3:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 14, <4 x float> [[X2]], i8 -1, i32 4) -; CHECK-NEXT: [[RES11:%.*]] = fadd <4 x float> [[RES]], [[RES1]] -; CHECK-NEXT: [[RES12:%.*]] = fadd <4 x float> [[RES2]], [[RES3]] -; CHECK-NEXT: [[RES13:%.*]] = fadd <4 x float> [[RES11]], [[RES12]] -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES13]] -; - %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4) - %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 12, <4 x float> zeroinitializer, i8 %x3, i32 4) - %res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 13, <4 x float> %x2, i8 -1, i32 8) - %res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 14, <4 x float> %x2, i8 -1, i32 4) - %res11 = fadd <4 x float> %res, %res1 - %res12 = fadd <4 x float> %res2, %res3 - %res13 = fadd <4 x float> %res11, %res12 - ret <4 x float> %res13 -} - -define <4 x float> @test_int_x86_avx512_mask_getmant_ss_load(<4 x float> %x0, ptr %x1p) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ss_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[X1:%.*]] = load <4 x float>, ptr [[X1P:%.*]], align 16 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[X1P]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1]], i32 11, <4 x float> undef, i8 -1, i32 4) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %x1 = load <4 x float>, ptr %x1p - %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> undef, i8 -1, i32 
4) - ret <4 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>) - -define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> 
[[RES]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES2]] -; - %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> %x2 - ret <8 x double> %res2 -} - -define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_maskz( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> [[RES]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES2]] -; - %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> zeroinitializer - ret <8 x double> %res2 -} - -declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>) - -define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; 
CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2 - ret <16 x float> %res2 -} - -define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_maskz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: 
[[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer - ret <16 x float> %res2 -} - -define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> ) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> ) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call 
void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> ) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> ) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2 - ret <16 x float> %res2 -} - -define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> ) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> ) - 
%mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer - ret <16 x float> %res2 -} - -declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32) - -define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ss2sd_round( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> [[X0]], <4 x float> [[X1]], <2 x double> [[X2]], i8 -1, i32 8) -; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[RES]], [[RES1]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES2]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 -} - -declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32) - -define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x 
float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_sd2ss_round(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> [[X0:%.*]], <2 x double> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]], i32 11)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> [[X0]], <2 x double> [[X1]], <4 x float> [[X2]], i8 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES2]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 11)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
- %res2 = fadd <4 x float> %res, %res1
- ret <4 x float> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
-
-define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pternlog_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP9]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
- ret <16 x i32> %1
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> [[TMP1]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X0]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X0]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
- %2 = bitcast i16 %x4 to <16 x i1>
- %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
- ret <16 x i32> %3
-}
-
-define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
- %2 = bitcast i16 %x4 to <16 x i1>
- %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
- ret <16 x i32> %3
-}
-
-declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
-
-define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pternlog_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP9]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
- ret <8 x i64> %1
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> [[TMP1]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[X0]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[X0]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP17]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
- %2 = bitcast i8 %x4 to <8 x i1>
- %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
- ret <8 x i64> %3
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP17]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
- %2 = bitcast i8 %x4 to <8 x i1>
- %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
- ret <8 x i64> %3
-}
-
-define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_comi_sd_eq_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 0, i32 8)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_ucomi_sd_eq_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 8, i32 8)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_comi_sd_eq(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 0, i32 4)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_ucomi_sd_eq(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 8, i32 4)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_comi_sd_lt_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 1, i32 8)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_ucomi_sd_lt_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 9, i32 8)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_comi_sd_lt(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 1, i32 4)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_ucomi_sd_lt(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 9, i32 4)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
-
-define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_ucomi_ss_lt(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i32 9, i32 4)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
-
-declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
-
-define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_permvar_df_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP7]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
- ret <8 x double> %1
-}
-
-define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_df_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP18]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
- %2 = bitcast i8 %x3 to <8 x i1>
- %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x2
- ret <8 x double> %3
-}
-
-define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_df_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
- %2 = bitcast i8 %x3 to <8 x i1>
- %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
- ret <8 x double> %3
-}
-
-declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
-
-define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_permvar_di_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
- ret <8 x i64> %1
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_di_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
- %2 = bitcast i8 %x3 to <8 x i1>
- %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2
- ret <8 x i64> %3
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_di_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
- %2 = bitcast i8 %x3 to <8 x i1>
- %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
- ret <8 x i64> %3
-}
-
-declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
-
-define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_permvar_sf_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
- ret <16 x float> %1
-}
-
-define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_sf_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
- %2 = bitcast i16 %x3 to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %x2
- ret <16 x float> %3
-}
-
-define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_sf_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
- %2 = bitcast i16 %x3 to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
-
-define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_permvar_si_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
- ret <16 x i32> %1
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_si_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
- %2 = bitcast i16 %x3 to <16 x i1>
- %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
- ret <16 x i32> %3
-}
-
-define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_si_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
- %2 = bitcast i16 %x3 to <16 x i1>
- %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
- ret <16 x i32> %3
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
-
-define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 4, i8 [[X4:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
-; CHECK: 12:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 13:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> [[X1]], <8 x i64> [[X2]], i32 5, i8 [[X4]], i32 4)
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0
-; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
-; CHECK: 17:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 18:
-; CHECK-NEXT: [[RES2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> [[X2]], i32 3, i8 -1, i32 8)
-; CHECK-NEXT: [[RES3:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES4]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8)
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res3, %res2
- ret <8 x double> %res4
-}
-
-define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512_load(<8 x double> %x0, <8 x double> %x1, ptr %x2ptr) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_pd_512_load(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[X2:%.*]] = load <8 x i64>, ptr [[X2PTR:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[_MSLD]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
-; CHECK: 12:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 13:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2]], i32 3, i8 -1, i32 4)
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES]]
-;
- %x2 = load <8 x i64>, ptr %x2ptr
- %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 4)
- ret <8 x double> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
-
-define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 3, i8 [[X4:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
-; CHECK: 12:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 13:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 4)
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0
-; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
-; CHECK: 17:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 18:
-; CHECK-NEXT: [[RES2:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> [[X2]], i32 2, i8 -1, i32 8)
-; CHECK-NEXT: [[RES3:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES4]]
-;
- %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
- %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
- %res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 2, i8 -1, i32 8)
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res3, %res2
- ret <8 x double> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
-
-define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
-; CHECK: 12:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 13:
-; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 5, i8 [[X4]], i32 4)
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
-; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
-; CHECK: 17:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 18:
-; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 5, i8 -1, i32 8)
-; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES4]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res3, %res2
- ret <4 x float> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
-
-define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
-; CHECK: 12:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 13:
-; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 5, i8 [[X4]], i32 8)
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
-; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
-; CHECK: 17:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 18:
-; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 6, i8 -1, i32 4)
-; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES4]]
-;
- %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8)
- %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 6, i8 -1, i32 4)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res3, %res2
- ret <4 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
-
-define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 5, i16 [[X4:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP6:%.*]] =
icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i16 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> zeroinitializer, i32 5, i16 [[X4]], i32 4) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> [[X2]], i32 5, i16 -1, i32 8) -; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[RES3]], [[RES2]] -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES4]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4) - %res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res3, %res2 - ret <16 x float> %res4 -} - -define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512_load(<16 x float> %x0, <16 x float> %x1, ptr %x2ptr) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ps_512_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2PTR:%.*]], align 64 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2PTR]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load 
<16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2]], i32 5, i16 -1, i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %x2 = load <16 x i32>, ptr %x2ptr - %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4) - ret <16 x float> %res -} - -declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32) - -define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 5, i16 [[X4:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i16 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 
[[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> zeroinitializer, i32 6, i16 [[X4]], i32 8) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> [[X2]], i32 7, i16 -1, i32 4) -; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[RES3]], [[RES2]] -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES4]] -; - %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) - %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 6, i16 %x4, i32 8) - %res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 7, i16 -1, i32 4) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res3, %res2 - ret <16 x float> %res4 -} - -declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32) - -define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], 
label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 8) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 6, i8 -1, i32 4) -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) - %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 6, i8 -1, i32 4) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res3, %res2 - ret <2 x double> %res4 -} - -declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32) - -define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 8) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] -; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 5, i8 [[X4]], i32 8) -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) - %res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x 
double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res3, %res2 - ret <2 x double> %res4 -} - -declare double @llvm.fma.f64(double, double, double) #1 -declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #0 - -define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP5]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP5]] -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X0]], double [[TMP18]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] -; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] -; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label 
[[TMP24:%.*]], !prof [[PROF1]] -; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 24: -; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11) -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0 -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] -; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] -; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 31: -; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10) -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP11]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP27]] to i64 -; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0 -; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP11]] -; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP27]] -; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT15]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X0]], double [[TMP42]], i64 0 -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]] -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]] -; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %1 = extractelement <2 x double> %x0, i64 0 - %2 = extractelement <2 x double> %x1, i64 0 - %3 = extractelement <2 x double> %x2, i64 0 - %4 = call double @llvm.fma.f64(double %1, double %2, double %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, double %4, double %1 - %8 = insertelement <2 x double> %x0, double %7, i64 0 - %9 = extractelement <2 x double> %x0, i64 0 - %10 = extractelement <2 x double> %x1, i64 0 - %11 = extractelement <2 x double> %x2, i64 0 - %12 = call double 
@llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11) - %13 = insertelement <2 x double> %x0, double %12, i64 0 - %14 = extractelement <2 x double> %x0, i64 0 - %15 = extractelement <2 x double> %x1, i64 0 - %16 = extractelement <2 x double> %x2, i64 0 - %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 10) - %18 = bitcast i8 %x3 to <8 x i1> - %19 = extractelement <8 x i1> %18, i64 0 - %20 = select i1 %19, double %17, double %14 - %21 = insertelement <2 x double> %x0, double %20, i64 0 - %res3 = fadd <2 x double> %8, %13 - %res4 = fadd <2 x double> %21, %res3 - ret <2 x double> %res4 -} - -define <4 x float> @test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP5]] to i32 -; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP5]] -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne 
i32 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] -; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] -; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] -; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 24: -; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11) -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0 -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] -; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] -; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 31: -; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10) -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP11]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP27]] to i32 -; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0 -; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP11]] -; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP27]] -; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT15]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X0]], float [[TMP42]], i64 0 -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]] -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]] -; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES4]] -; - %1 = extractelement <4 x float> %x0, i64 0 - %2 = extractelement <4 x float> %x1, i64 0 - %3 = extractelement <4 x float> %x2, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = 
extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, float %4, float %1 - %8 = insertelement <4 x float> %x0, float %7, i64 0 - %9 = extractelement <4 x float> %x0, i64 0 - %10 = extractelement <4 x float> %x1, i64 0 - %11 = extractelement <4 x float> %x2, i64 0 - %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11) - %13 = insertelement <4 x float> %x0, float %12, i64 0 - %14 = extractelement <4 x float> %x0, i64 0 - %15 = extractelement <4 x float> %x1, i64 0 - %16 = extractelement <4 x float> %x2, i64 0 - %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 10) - %18 = bitcast i8 %x3 to <8 x i1> - %19 = extractelement <8 x i1> %18, i64 0 - %20 = select i1 %19, float %17, float %14 - %21 = insertelement <4 x float> %x0, float %20, i64 0 - %res3 = fadd <4 x float> %8, %13 - %res4 = fadd <4 x float> %21, %res3 - ret <4 x float> %res4 -} - -define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ -; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_sd( -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 0, i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast double [[TMP4]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP10]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i64 [[TMP11]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], double [[TMP4]], double 0.000000e+00 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> [[X0]], double [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]], i32 11) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 0, i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = bitcast double [[TMP17]] to i64 -; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select i1 false, i64 [[TMP24]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], double [[TMP17]], double 0.000000e+00 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0 -; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[TMP13]], [[TMP26]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES2]] -; - %1 = extractelement <2 x double> %x0, i64 0 - %2 = extractelement <2 x double> %x1, i64 0 - %3 = extractelement <2 x double> %x2, i64 0 - %4 = call double @llvm.fma.f64(double 
%1, double %2, double %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, double %4, double 0.000000e+00 - %8 = insertelement <2 x double> %x0, double %7, i64 0 - %9 = extractelement <2 x double> %x0, i64 0 - %10 = extractelement <2 x double> %x1, i64 0 - %11 = extractelement <2 x double> %x2, i64 0 - %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11) - %13 = bitcast i8 %x3 to <8 x i1> - %14 = extractelement <8 x i1> %13, i64 0 - %15 = select i1 %14, double %12, double 0.000000e+00 - %16 = insertelement <2 x double> %x0, double %15, i64 0 - %res2 = fadd <2 x double> %8, %16 - ret <2 x double> %res2 -} - -declare float @llvm.fma.f32(float, float, float) #1 -declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #0 - -define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ -; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss( -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float [[TMP4]] to i32 -; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP8]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = or i32 [[TMP10]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP11]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], float [[TMP4]], float 0.000000e+00 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[X0]], float [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]], i32 11) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 0 -; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP17]] to i32 -; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select i1 false, i32 [[TMP24]], i32 [[TMP20]] -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], float [[TMP17]], float 0.000000e+00 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0 -; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[TMP13]], [[TMP26]] -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES2]] -; - %1 = extractelement <4 x float> %x0, i64 0 - %2 = extractelement <4 x float> %x1, i64 0 - %3 = extractelement <4 x float> %x2, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, float 
%4, float 0.000000e+00 - %8 = insertelement <4 x float> %x0, float %7, i64 0 - %9 = extractelement <4 x float> %x0, i64 0 - %10 = extractelement <4 x float> %x1, i64 0 - %11 = extractelement <4 x float> %x2, i64 0 - %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11) - %13 = bitcast i8 %x3 to <8 x i1> - %14 = extractelement <8 x i1> %13, i64 0 - %15 = select i1 %14, float %12, float 0.000000e+00 - %16 = insertelement <4 x float> %x0, float %15, i64 0 - %res2 = fadd <4 x float> %8, %16 - ret <4 x float> %res2 -} - -define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_load0(i8 zeroext %0, ptr nocapture readonly %1, float %2, float %3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_load0( -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[TMP1:%.*]], align 16 -; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP1]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080 -; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP14]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[_MSLD]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], [[TMP6]] -; CHECK-NEXT: [[_MSPROP2:%.*]] = or i32 [[_MSPROP1]], [[TMP7]] -; CHECK-NEXT: [[TMP16:%.*]] = tail call float @llvm.fma.f32(float [[TMP15]], float [[TMP2:%.*]], float [[TMP3:%.*]]) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP8]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP0:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <8 x i1> [[TMP17]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[_MSPROP2]], i32 0 -; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP16]] to i32 -; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP3]], i32 [[TMP24]], i32 [[TMP20]] -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], float [[TMP16]], float 0.000000e+00 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <4 x i32> [[_MSLD]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP25]], i64 0 -; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[TMP26]] -; - %5 = load <4 x float>, ptr %1, align 16 - %6 = extractelement <4 x float> %5, i64 0 - %7 = tail call float @llvm.fma.f32(float %6, float %2, float %3) #2 - %8 = bitcast i8 %0 to <8 x i1> - %9 = extractelement <8 x i1> %8, i64 0 - %10 = 
select i1 %9, float %7, float 0.000000e+00 - %11 = insertelement <4 x float> %5, float %10, i64 0 - ret <4 x float> %11 -} - -define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP2]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP7]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP2]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP7]] -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X2]], double [[TMP18]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] -; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] -; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] -; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 24: -; CHECK-NEXT: 
[[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11) -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X2]], double [[TMP25]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0 -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] -; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] -; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 31: -; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10) -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP13]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP29]] to i64 -; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0 -; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP13]] -; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP29]] -; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT15]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X2]], double [[TMP42]], i64 0 -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]] -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]] -; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %1 = extractelement <2 x double> %x0, i64 0 - %2 = extractelement <2 x double> %x1, i64 0 - %3 = extractelement <2 x double> %x2, i64 0 - %4 = call double @llvm.fma.f64(double %1, double %2, double %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, double %4, double %3 - %8 = insertelement <2 x double> %x2, double %7, i64 0 - %9 = extractelement <2 x double> %x0, i64 0 - %10 = extractelement <2 x double> %x1, i64 0 - %11 = extractelement <2 x double> %x2, i64 0 - %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11) - %13 = insertelement <2 x double> %x2, double %12, i64 0 - %14 = extractelement <2 x double> %x0, i64 0 - %15 
= extractelement <2 x double> %x1, i64 0 - %16 = extractelement <2 x double> %x2, i64 0 - %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 10) - %18 = bitcast i8 %x3 to <8 x i1> - %19 = extractelement <8 x i1> %18, i64 0 - %20 = select i1 %19, double %17, double %16 - %21 = insertelement <2 x double> %x2, double %20, i64 0 - %res3 = fadd <2 x double> %8, %13 - %res4 = fadd <2 x double> %21, %res3 - ret <2 x double> %res4 -} - -define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP2]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP7]] to i32 -; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP2]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP7]] -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X2]], float [[TMP18]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] -; CHECK-NEXT: [[_MSCMP20:%.*]] = 
icmp ne i32 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] -; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] -; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 24: -; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11) -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP3]], i32 0, i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X2]], float [[TMP25]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0 -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] -; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] -; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 31: -; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10) -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP13]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP29]] to i32 -; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0 -; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP13]] -; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP29]] -; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT15]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X2]], float [[TMP42]], i64 0 -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]] -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]] -; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES4]] -; - %1 = extractelement <4 x float> %x0, i64 0 - %2 = extractelement <4 x float> %x1, i64 0 - %3 = extractelement <4 x float> %x2, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, float %4, float %3 - %8 = insertelement <4 x float> %x2, float %7, i64 0 - %9 = extractelement <4 x float> %x0, i64 0 - %10 = 
extractelement <4 x float> %x1, i64 0 - %11 = extractelement <4 x float> %x2, i64 0 - %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11) - %13 = insertelement <4 x float> %x2, float %12, i64 0 - %14 = extractelement <4 x float> %x0, i64 0 - %15 = extractelement <4 x float> %x1, i64 0 - %16 = extractelement <4 x float> %x2, i64 0 - %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 10) - %18 = bitcast i8 %x3 to <8 x i1> - %19 = extractelement <8 x i1> %18, i64 0 - %20 = select i1 %19, float %17, float %16 - %21 = insertelement <4 x float> %x2, float %20, i64 0 - %res3 = fadd <4 x float> %8, %13 - %res4 = fadd <4 x float> %21, %res3 - ret <4 x float> %res4 -} - -define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) #0 { -; CHECK-LABEL: @fmadd_ss_mask_memfold( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1 -; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2 -; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3 -; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3 -; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0 -; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1 -; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2 -; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 
x float> [[BV1]], float 0.000000e+00, i32 2 -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3 -; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]] -; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]] -; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]]) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 [[_MSPROP9]] -; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP14]] to i32 -; CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], [[_MSPROP13]] -; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP25]], [[_MSPROP9]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP26]], i32 [[TMP21]] -; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], float [[TMP17]], float [[TMP14]] -; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[AV]], float [[TMP27]], i64 0 -; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0 -; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP28]], i32 0 -; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]] -; CHECK: 29: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 30: -; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080 -; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr -; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP33]], align 4 -; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4 -; CHECK-NEXT: ret void -; - %a.val = load float, ptr %a - %av0 = insertelement <4 x float> undef, float %a.val, i32 0 - %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1 - %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2 - %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3 - - %b.val = load float, ptr %b - %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 - %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 - %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 - %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 - %1 = extractelement <4 x float> %av, i64 0 - %2 = extractelement <4 x float> %bv, i64 0 - %3 = extractelement <4 x float> %av, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %c to <8 x i1> - %6 = extractelement <8 
x i1> %5, i64 0 - %7 = select i1 %6, float %4, float %1 - %8 = insertelement <4 x float> %av, float %7, i64 0 - %sr = extractelement <4 x float> %8, i32 0 - store float %sr, ptr %a - ret void -} - -define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 { -; CHECK-LABEL: @fmadd_ss_maskz_memfold( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1 -; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2 -; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3 -; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3 -; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0 -; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1 -; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2 -; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2 -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3 -; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0 -; 
CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]] -; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]] -; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]]) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32 -; CHECK-NEXT: [[TMP23:%.*]] = xor i32 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP13]] -; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP25]], i32 [[TMP21]] -; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], float [[TMP17]], float 0.000000e+00 -; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[AV]], float [[TMP26]], i64 0 -; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0 -; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP27]], i32 0 -; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] -; CHECK: 28: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 29: -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr -; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP32]], align 4 -; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4 -; CHECK-NEXT: ret void -; - %a.val = load float, ptr %a - %av0 = insertelement <4 x float> undef, float %a.val, i32 0 - %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1 - %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2 - %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3 - - %b.val = load float, ptr %b - %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 - %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 - %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 - %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 - %1 = extractelement <4 x float> %av, i64 0 - %2 = extractelement <4 x float> %bv, i64 0 - %3 = extractelement <4 x float> %av, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %c to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, float %4, float 0.000000e+00 - %8 = insertelement <4 x float> %av, float %7, i64 0 - %sr = extractelement <4 x float> %8, i32 0 - store float %sr, ptr %a - ret void -} - -define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) #0 { -; CHECK-LABEL: @fmadd_sd_mask_memfold( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls 
to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1 -; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0 -; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1 -; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]] -; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]] -; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]]) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 [[_MSPROP5]] -; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast double [[TMP14]] to i64 -; CHECK-NEXT: [[TMP24:%.*]] = xor i64 [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], [[_MSPROP9]] -; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP25]], [[_MSPROP5]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP26]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], double [[TMP17]], double [[TMP14]] -; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x 
i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> [[AV]], double [[TMP27]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0 -; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP28]], i32 0 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]] -; CHECK: 29: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 30: -; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080 -; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr -; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP33]], align 8 -; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8 -; CHECK-NEXT: ret void -; - %a.val = load double, ptr %a - %av0 = insertelement <2 x double> undef, double %a.val, i32 0 - %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 - - %b.val = load double, ptr %b - %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 - %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 - %1 = extractelement <2 x double> %av, i64 0 - %2 = extractelement <2 x double> %bv, i64 0 - %3 = extractelement <2 x double> %av, i64 0 - %4 = call double @llvm.fma.f64(double %1, double %2, double %3) - %5 = bitcast i8 %c to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, double %4, double %1 - %8 = insertelement <2 x double> %av, double %7, i64 0 - %sr = extractelement <2 x double> %8, i32 0 - store double %sr, ptr %a - ret void -} - -define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 { -; CHECK-LABEL: @fmadd_sd_maskz_memfold( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1 -; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 -; 
CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0 -; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1 -; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]] -; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]] -; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]]) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64 -; CHECK-NEXT: [[TMP23:%.*]] = xor i64 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP9]] -; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP25]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], double [[TMP17]], double 0.000000e+00 -; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[AV]], double [[TMP26]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0 -; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP27]], i32 0 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] -; CHECK: 28: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 29: -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr -; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP32]], align 8 -; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8 -; CHECK-NEXT: ret void -; - %a.val = load double, ptr %a - %av0 = insertelement <2 x double> undef, double %a.val, i32 0 - %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 - - %b.val = load double, ptr %b - %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 - %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 - %1 = extractelement <2 x double> %av, i64 0 - %2 = extractelement <2 x double> %bv, i64 0 - %3 = extractelement <2 x double> %av, i64 0 - %4 = call double @llvm.fma.f64(double %1, double %2, double %3) - %5 = bitcast i8 %c to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, double %4, double 0.000000e+00 - 
%8 = insertelement <2 x double> %av, double %7, i64 0 - %sr = extractelement <2 x double> %8, i32 0 - store double %sr, ptr %a - ret void -} - -define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X2:%.*]] -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP6]], double [[TMP7]], double [[TMP8]]) -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[_MSPROP4]], i64 [[_MSPROP5]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast double [[TMP9]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64 -; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP17]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP5]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP19]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], double [[TMP9]], double [[TMP10]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> [[X2]], double [[TMP20]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = fneg <2 x double> [[X2]] -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP22]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0 -; 
CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] -; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] -; CHECK: 26: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 27: -; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP23]], double [[TMP24]], double [[TMP25]], i32 11) -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> [[X2]], double [[TMP28]], i64 0 -; CHECK-NEXT: [[TMP31:%.*]] = fneg <2 x double> [[X2]] -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP31]], i64 0 -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0 -; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] -; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0 -; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] -; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] -; CHECK: 35: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 36: -; CHECK-NEXT: [[TMP37:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP32]], double [[TMP33]], double [[TMP34]], i32 10) -; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i64 0, i64 [[_MSPROP16]] -; CHECK-NEXT: [[TMP43:%.*]] = bitcast double [[TMP37]] to i64 -; CHECK-NEXT: [[TMP44:%.*]] = bitcast double [[TMP38]] to i64 -; CHECK-NEXT: [[TMP45:%.*]] = xor i64 [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP45]], 0 -; CHECK-NEXT: [[TMP47:%.*]] = or i64 [[TMP46]], [[_MSPROP16]] -; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP47]], i64 [[TMP42]] -; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], double [[TMP37]], double [[TMP38]] -; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT18]], i64 0 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x double> [[X2]], double [[TMP48]], i64 0 -; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP21]], [[TMP30]] -; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP49]], [[RES3]] -; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %1 = fneg <2 x double> %x2 - %2 = extractelement <2 x double> %x0, i64 0 
- %3 = extractelement <2 x double> %x1, i64 0 - %4 = extractelement <2 x double> %1, i64 0 - %5 = call double @llvm.fma.f64(double %2, double %3, double %4) - %6 = extractelement <2 x double> %x2, i64 0 - %7 = bitcast i8 %x3 to <8 x i1> - %8 = extractelement <8 x i1> %7, i64 0 - %9 = select i1 %8, double %5, double %6 - %10 = insertelement <2 x double> %x2, double %9, i64 0 - %11 = fneg <2 x double> %x2 - %12 = extractelement <2 x double> %x0, i64 0 - %13 = extractelement <2 x double> %x1, i64 0 - %14 = extractelement <2 x double> %11, i64 0 - %15 = call double @llvm.x86.avx512.vfmadd.f64(double %12, double %13, double %14, i32 11) - %16 = extractelement <2 x double> %x2, i64 0 - %17 = insertelement <2 x double> %x2, double %15, i64 0 - %18 = fneg <2 x double> %x2 - %19 = extractelement <2 x double> %x0, i64 0 - %20 = extractelement <2 x double> %x1, i64 0 - %21 = extractelement <2 x double> %18, i64 0 - %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 10) - %23 = extractelement <2 x double> %x2, i64 0 - %24 = bitcast i8 %x3 to <8 x i1> - %25 = extractelement <8 x i1> %24, i64 0 - %26 = select i1 %25, double %22, double %23 - %27 = insertelement <2 x double> %x2, double %26, i64 0 - %res3 = fadd <2 x double> %10, %17 - %res4 = fadd <2 x double> %27, %res3 - ret <2 x double> %res4 -} - -define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X2:%.*]] -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP9:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float [[TMP7]], float [[TMP8]]) -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[_MSPROP4]], i32 [[_MSPROP5]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast float [[TMP9]] to i32 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32 -; CHECK-NEXT: [[TMP17:%.*]] = xor i32 [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP17]], [[_MSPROP4]] 
-; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP5]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP19]], i32 [[TMP14]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], float [[TMP9]], float [[TMP10]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[X2]], float [[TMP20]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = fneg <4 x float> [[X2]] -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP22]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] -; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] -; CHECK: 26: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 27: -; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 11) -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[X2]], float [[TMP28]], i64 0 -; CHECK-NEXT: [[TMP31:%.*]] = fneg <4 x float> [[X2]] -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP31]], i64 0 -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0 -; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] -; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0 -; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] -; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] -; CHECK: 35: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 36: -; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP32]], float [[TMP33]], float [[TMP34]], i32 10) -; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 0, i32 [[_MSPROP16]] -; 
CHECK-NEXT: [[TMP43:%.*]] = bitcast float [[TMP37]] to i32 -; CHECK-NEXT: [[TMP44:%.*]] = bitcast float [[TMP38]] to i32 -; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP45]], 0 -; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP46]], [[_MSPROP16]] -; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP47]], i32 [[TMP42]] -; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], float [[TMP37]], float [[TMP38]] -; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT18]], i64 0 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[X2]], float [[TMP48]], i64 0 -; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP21]], [[TMP30]] -; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP49]], [[RES3]] -; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES4]] -; - %1 = fneg <4 x float> %x2 - %2 = extractelement <4 x float> %x0, i64 0 - %3 = extractelement <4 x float> %x1, i64 0 - %4 = extractelement <4 x float> %1, i64 0 - %5 = call float @llvm.fma.f32(float %2, float %3, float %4) - %6 = extractelement <4 x float> %x2, i64 0 - %7 = bitcast i8 %x3 to <8 x i1> - %8 = extractelement <8 x i1> %7, i64 0 - %9 = select i1 %8, float %5, float %6 - %10 = insertelement <4 x float> %x2, float %9, i64 0 - %11 = fneg <4 x float> %x2 - %12 = extractelement <4 x float> %x0, i64 0 - %13 = extractelement <4 x float> %x1, i64 0 - %14 = extractelement <4 x float> %11, i64 0 - %15 = call float @llvm.x86.avx512.vfmadd.f32(float %12, float %13, float %14, i32 11) - %16 = extractelement <4 x float> %x2, i64 0 - %17 = insertelement <4 x float> %x2, float %15, i64 0 - %18 = fneg <4 x float> %x2 - %19 = extractelement <4 x float> %x0, i64 0 - %20 = extractelement <4 x float> %x1, i64 0 - %21 = extractelement <4 x float> %18, i64 0 - %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 10) - %23 = extractelement <4 x float> %x2, i64 0 - %24 = bitcast i8 %x3 to <8 x i1> - %25 = extractelement <8 x i1> %24, i64 0 - %26 = select i1 %25, float %22, float %23 - %27 = insertelement <4 x float> %x2, float %26, i64 0 - %res3 = fadd <4 x float> %10, %17 - %res4 = fadd <4 x float> %27, %res3 - ret <4 x float> %res4 -} - -define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X0:%.*]] -; CHECK-NEXT: [[TMP6:%.*]] = fneg <2 x double> [[X2:%.*]] -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = 
extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP10:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]]) -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[_MSPROP4]], i64 [[_MSPROP5]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast double [[TMP11]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP5]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP20]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], double [[TMP10]], double [[TMP11]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[X2]], double [[TMP21]], i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = fneg <2 x double> [[X0]] -; CHECK-NEXT: [[TMP24:%.*]] = fneg <2 x double> [[X2]] -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP23]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[TMP24]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] -; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] -; CHECK: 28: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 29: -; CHECK-NEXT: [[TMP30:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]], i32 11) -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP2]], i64 0, i64 0 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[X2]], double [[TMP30]], i64 0 -; CHECK-NEXT: [[TMP33:%.*]] = fneg <2 x double> [[X0]] -; CHECK-NEXT: [[TMP34:%.*]] = fneg <2 x double> [[X2]] -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i64 0 -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x 
double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x double> [[TMP34]], i64 0 -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0 -; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] -; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0 -; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] -; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] -; CHECK: 38: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 39: -; CHECK-NEXT: [[TMP40:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP35]], double [[TMP36]], double [[TMP37]], i32 10) -; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0 -; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i64 0, i64 [[_MSPROP16]] -; CHECK-NEXT: [[TMP46:%.*]] = bitcast double [[TMP40]] to i64 -; CHECK-NEXT: [[TMP47:%.*]] = bitcast double [[TMP41]] to i64 -; CHECK-NEXT: [[TMP48:%.*]] = xor i64 [[TMP46]], [[TMP47]] -; CHECK-NEXT: [[TMP49:%.*]] = or i64 [[TMP48]], 0 -; CHECK-NEXT: [[TMP50:%.*]] = or i64 [[TMP49]], [[_MSPROP16]] -; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP50]], i64 [[TMP45]] -; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], double [[TMP40]], double [[TMP41]] -; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT18]], i64 0 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x double> [[X2]], double [[TMP51]], i64 0 -; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP22]], [[TMP32]] -; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP52]], [[RES3]] -; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %1 = fneg <2 x double> %x0 - %2 = fneg <2 x double> %x2 - %3 = extractelement <2 x double> %1, i64 0 - %4 = extractelement <2 x double> %x1, i64 0 - %5 = extractelement <2 x double> %2, i64 0 - %6 = call double @llvm.fma.f64(double %3, double %4, double %5) - %7 = extractelement <2 x double> %x2, i64 0 - %8 = bitcast i8 %x3 to <8 x i1> - %9 = extractelement <8 x i1> %8, i64 0 - %10 = select i1 %9, double %6, double %7 - %11 = insertelement <2 x double> %x2, double %10, i64 0 - %12 = fneg <2 x double> %x0 - %13 = fneg <2 x double> %x2 - %14 = extractelement <2 x double> %12, i64 0 - %15 = extractelement <2 x double> %x1, i64 0 - %16 = extractelement <2 x double> %13, i64 0 - %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 11) - %18 = extractelement <2 x double> %x2, i64 0 - %19 = insertelement <2 x double> %x2, double %17, i64 0 - %20 = fneg <2 x double> %x0 - %21 = fneg <2 x double> %x2 - %22 = extractelement <2 x double> %20, i64 0 - %23 = extractelement <2 x double> %x1, i64 0 - %24 = extractelement <2 x double> %21, i64 0 - %25 = call double 
@llvm.x86.avx512.vfmadd.f64(double %22, double %23, double %24, i32 10) - %26 = extractelement <2 x double> %x2, i64 0 - %27 = bitcast i8 %x3 to <8 x i1> - %28 = extractelement <8 x i1> %27, i64 0 - %29 = select i1 %28, double %25, double %26 - %30 = insertelement <2 x double> %x2, double %29, i64 0 - %res3 = fadd <2 x double> %11, %19 - %res4 = fadd <2 x double> %30, %res3 - ret <2 x double> %res4 -} - -define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X0:%.*]] -; CHECK-NEXT: [[TMP6:%.*]] = fneg <4 x float> [[X2:%.*]] -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]]) -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[_MSPROP4]], i32 [[_MSPROP5]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast float [[TMP11]] to i32 -; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP5]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP20]], i32 [[TMP15]] -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], float [[TMP10]], float [[TMP11]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[X2]], float [[TMP21]], i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = fneg <4 x float> [[X0]] -; CHECK-NEXT: [[TMP24:%.*]] = fneg <4 x float> [[X2]] -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP23]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP10:%.*]] = 
extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP24]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] -; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] -; CHECK: 28: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 29: -; CHECK-NEXT: [[TMP30:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]], i32 11) -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i64 0 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[X2]], float [[TMP30]], i64 0 -; CHECK-NEXT: [[TMP33:%.*]] = fneg <4 x float> [[X0]] -; CHECK-NEXT: [[TMP34:%.*]] = fneg <4 x float> [[X2]] -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i64 0 -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP34]], i64 0 -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0 -; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] -; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0 -; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] -; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] -; CHECK: 38: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 39: -; CHECK-NEXT: [[TMP40:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP35]], float [[TMP36]], float [[TMP37]], i32 10) -; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0 -; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 0, i32 [[_MSPROP16]] -; CHECK-NEXT: [[TMP46:%.*]] = bitcast float [[TMP40]] to i32 -; CHECK-NEXT: [[TMP47:%.*]] = bitcast float [[TMP41]] to i32 -; CHECK-NEXT: [[TMP48:%.*]] = xor i32 [[TMP46]], [[TMP47]] -; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], 0 -; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[_MSPROP16]] -; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP50]], i32 [[TMP45]] -; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], float [[TMP40]], float [[TMP41]] -; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT18]], i64 0 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[X2]], float [[TMP51]], i64 0 -; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]] -; 
CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP22]], [[TMP32]] -; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP52]], [[RES3]] -; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES4]] -; - %1 = fneg <4 x float> %x0 - %2 = fneg <4 x float> %x2 - %3 = extractelement <4 x float> %1, i64 0 - %4 = extractelement <4 x float> %x1, i64 0 - %5 = extractelement <4 x float> %2, i64 0 - %6 = call float @llvm.fma.f32(float %3, float %4, float %5) - %7 = extractelement <4 x float> %x2, i64 0 - %8 = bitcast i8 %x3 to <8 x i1> - %9 = extractelement <8 x i1> %8, i64 0 - %10 = select i1 %9, float %6, float %7 - %11 = insertelement <4 x float> %x2, float %10, i64 0 - %12 = fneg <4 x float> %x0 - %13 = fneg <4 x float> %x2 - %14 = extractelement <4 x float> %12, i64 0 - %15 = extractelement <4 x float> %x1, i64 0 - %16 = extractelement <4 x float> %13, i64 0 - %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 11) - %18 = extractelement <4 x float> %x2, i64 0 - %19 = insertelement <4 x float> %x2, float %17, i64 0 - %20 = fneg <4 x float> %x0 - %21 = fneg <4 x float> %x2 - %22 = extractelement <4 x float> %20, i64 0 - %23 = extractelement <4 x float> %x1, i64 0 - %24 = extractelement <4 x float> %21, i64 0 - %25 = call float @llvm.x86.avx512.vfmadd.f32(float %22, float %23, float %24, i32 10) - %26 = extractelement <4 x float> %x2, i64 0 - %27 = bitcast i8 %x3 to <8 x i1> - %28 = extractelement <8 x i1> %27, i64 0 - %29 = select i1 %28, float %25, float %26 - %30 = insertelement <4 x float> %x2, float %29, i64 0 - %res3 = fadd <4 x float> %11, %19 - %res4 = fadd <4 x float> %30, %res3 - ret <4 x float> %res4 -} - -define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr%ptr_b ,i8 %x3,i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> 
[[VECINIT_I]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] -; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] -; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0 -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP3]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP12]] to i32 -; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]] -; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]] -; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP12]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X1]], float [[TMP23]], i64 0 -; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[TMP24]] -; - %q = load float, ptr %ptr_b - %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 - %1 = extractelement <4 x float> %x0, i64 0 - %2 = extractelement <4 x float> %vecinit.i, i64 0 - %3 = extractelement <4 x float> %x1, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, float %4, float %3 - %8 = insertelement <4 x float> %x1, float %7, i64 0 - ret <4 x float> %8 -} - -define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = 
extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] -; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] -; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0 -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP1]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP10]] to i32 -; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]] -; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]] -; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP10]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X0]], float [[TMP23]], i64 0 -; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[TMP24]] -; - %q = load float, ptr %ptr_b - %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 - %1 = extractelement <4 x float> %x0, i64 0 - %2 = extractelement <4 x float> %vecinit.i, i64 0 - %3 = extractelement <4 x float> %x1, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, float %4, float %1 - %8 = insertelement <4 x float> %x0, float %7, i64 0 - ret <4 x float> %8 -} - - -define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], 
i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] -; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] -; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.fma.f32(float [[TMP9]], float [[TMP10]], float [[TMP11]]) -; CHECK-NEXT: [[TMP13:%.*]] = select i1 false, i32 [[_MSPROP5]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP12]] to i32 -; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP14]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP5]] -; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP17]], i32 [[TMP13]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 false, float [[TMP12]], float 0.000000e+00 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0 -; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[TMP19]] -; - %q = load float, ptr %ptr_b - %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 - %1 = extractelement <4 x float> %x0, i64 0 - %2 = extractelement <4 x float> %x1, i64 0 - %3 = extractelement <4 x float> %vecinit.i, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = select i1 false, float %4, float 0.000000e+00 - %6 = insertelement <4 x float> %x0, float %5, i64 0 - ret <4 x float> %6 -} - -define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psll_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psll_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru - ret <16 x i32> %res2 -} -define <16 x i32> @test_x86_avx512_maskz_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psll_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]] -; 
CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} -declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone - - -define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psll_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psll_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> 
[[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru - ret <8 x i64> %res2 -} -define <8 x i64> @test_x86_avx512_maskz_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psll_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} -declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone - - -define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_pslli_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <16 x i32> [[TMP3]], 
ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_pslli_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru - ret <16 x i32> %res2 -} -define <16 x i32> @test_x86_avx512_maskz_pslli_d_512(<16 x i32> %a0, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_pslli_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> 
@llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} -declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone - - -define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_pslli_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_pslli_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru - ret <8 x i64> %res2 -} -define <8 x i64> @test_x86_avx512_maskz_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_pslli_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> 
[[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} -declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone - - -define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psra_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psra_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x 
i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru - ret <8 x i64> %res2 -} -define <8 x i64> @test_x86_avx512_maskz_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psra_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} -declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone - - -define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psra_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 
8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psra_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru - ret <16 x i32> %res2 -} -define <16 x i32> @test_x86_avx512_maskz_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psra_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = 
load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} -declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone - - - -define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrai_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrai_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> 
[[TMP4]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru - ret <8 x i64> %res2 -} -define <8 x i64> @test_x86_avx512_maskz_psrai_q_512(<8 x i64> %a0, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrai_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} -declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone - - -define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrai_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> 
@llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrai_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru - ret <16 x i32> %res2 -} -define <16 x i32> @test_x86_avx512_maskz_psrai_d_512(<16 x i32> %a0, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrai_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 
%mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} -declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone - - - -define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrl_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrl_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x 
i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru - ret <16 x i32> %res2 -} -define <16 x i32> @test_x86_avx512_maskz_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrl_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} -declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone - - -define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrl_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr 
@__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrl_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru - ret <8 x i64> %res2 -} -define <8 x i64> @test_x86_avx512_maskz_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrl_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[RES:%.*]] = call <8 
x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} -declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone - - -define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrli_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrli_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x 
i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru - ret <16 x i32> %res2 -} -define <16 x i32> @test_x86_avx512_maskz_psrli_d_512(<16 x i32> %a0, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrli_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} -declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone - - -define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrli_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrli_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: 
[[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru - ret <8 x i64> %res2 -} -define <8 x i64> @test_x86_avx512_maskz_psrli_q_512(<8 x i64> %a0, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrli_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} -declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone - -define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psllv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> 
[[TMP5]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_psllv_d_512_const() #0 { -; CHECK-LABEL: @test_x86_avx512_psllv_d_512_const( -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) -; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES0]], [[RES1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) - %res1 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) - %res2 = add <16 x i32> %res0, %res1 - ret <16 x i32> %res2 -} - -define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psllv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> 
%a0, <16 x i32> %a1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 - ret <16 x i32> %res2 -} - -define <16 x i32> @test_x86_avx512_maskz_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psllv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} - -declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone - -define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psllv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_psllv_q_512_const() #0 { -; CHECK-LABEL: @test_x86_avx512_psllv_q_512_const( -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> 
@llvm.x86.avx512.psllv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) -; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[RES0:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES1:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[RES2:%.*]] = add <8 x i64> [[RES0]], [[RES1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) - %res1 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) - %res2 = add <8 x i64> %res0, %res1 - ret <8 x i64> %res2 -} - -define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psllv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 - ret <8 x i64> %res2 -} - -define <8 x i64> @test_x86_avx512_maskz_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psllv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} - -declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone - -define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrav_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrav_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call 
<16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 - ret <16 x i32> %res2 -} - -define <16 x i32> @test_x86_avx512_maskz_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrav_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} - -declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone - -define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrav_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load 
<8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrav_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 - ret <8 x i64> %res2 -} - -define <8 x i64> @test_x86_avx512_maskz_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrav_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = 
icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} - -declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone - -define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrlv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_psrlv_d_512_const() #0 { -; CHECK-LABEL: @test_x86_avx512_psrlv_d_512_const( -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) -; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES0]], [[RES1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res0 
= call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) - %res1 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) - %res2 = add <16 x i32> %res0, %res1 - ret <16 x i32> %res2 -} - -define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrlv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 - ret <16 x i32> %res2 -} - -define <16 x i32> @test_x86_avx512_maskz_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> 
[[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} - -declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone - -define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrlv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_psrlv_q_512_const() #0 { -; CHECK-LABEL: @test_x86_avx512_psrlv_q_512_const( -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) -; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[RES0:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES1:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[RES2:%.*]] = add <8 x i64> [[RES0]], [[RES1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) - %res1 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) - %res2 = add <8 x i64> %res0, %res1 - ret <8 x i64> %res2 -} - -define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrlv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; 
CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 - ret <8 x i64> %res2 -} - -define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> 
%a1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} - -declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone - - -define <8 x double> @test_mm256_castpd128_pd256_freeze(<2 x double> %a0) nounwind #0 { -; CHECK-LABEL: @test_mm256_castpd128_pd256_freeze( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[A1:%.*]] = freeze <2 x double> poison -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <8 x i32> -; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> [[A1]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %a1 = freeze <2 x double> poison - %res = shufflevector <2 x double> %a0, <2 x double> %a1, <8 x i32> - ret <8 x double> %res -} - - -define <8 x double> @test_mm256_castpd256_pd256_freeze(<4 x double> %a0) nounwind #0 { -; CHECK-LABEL: @test_mm256_castpd256_pd256_freeze( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[A1:%.*]] = freeze <4 x double> poison -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> zeroinitializer, <8 x i32> -; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> [[A1]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %a1 = freeze <4 x double> poison - %res = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> - ret <8 x double> %res -} - - -define <16 x float> @test_mm256_castps128_ps512_freeze(<4 x float> %a0) nounwind #0 { -; CHECK-LABEL: @test_mm256_castps128_ps512_freeze( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[A1:%.*]] = freeze <4 x float> poison -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> zeroinitializer, <16 x i32> -; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %a1 = freeze <4 x float> poison - %res = shufflevector <4 x float> %a0, <4 x float> %a1, <16x i32> - ret <16 x float> %res -} - - -define <16 x float> @test_mm256_castps256_ps512_freeze(<8 x float> %a0) nounwind #0 { -; CHECK-LABEL: @test_mm256_castps256_ps512_freeze( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[A1:%.*]] = freeze <8 x float> poison -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> zeroinitializer, <16 x i32> -; CHECK-NEXT: [[RES:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> [[A1]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %a1 = freeze <8 x float> poison - %res = shufflevector <8 x float> %a0, <8 x float> %a1, <16x i32> - ret <16 x float> %res -} - - -define <8 x i64> @test_mm512_castsi128_si512_freeze(<2 x i64> %a0) nounwind #0 { -; CHECK-LABEL: @test_mm512_castsi128_si512_freeze( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: 
call void @llvm.donothing() -; CHECK-NEXT: [[A1:%.*]] = freeze <2 x i64> poison -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <8 x i32> -; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i64> [[A0:%.*]], <2 x i64> [[A1]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %a1 = freeze <2 x i64> poison - %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <8 x i32> - ret <8 x i64> %res -} - - -define <8 x i64> @test_mm512_castsi256_si512_pd256_freeze(<4 x i64> %a0) nounwind #0 { -; CHECK-LABEL: @test_mm512_castsi256_si512_pd256_freeze( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[A1:%.*]] = freeze <4 x i64> poison -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> zeroinitializer, <8 x i32> -; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> [[A1]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %a1 = freeze <4 x i64> poison - %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> - ret <8 x i64> %res -} - - -define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) #0 { -; CHECK-LABEL: @bad_mask_transition( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP0]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP10]] to i8 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP13]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; 
CHECK: 15: -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[C:%.*]], <8 x double> [[D:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i1> [[TMP16]] to i8 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i16 -; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP17]] to i16 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[CONV]] to <16 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[CONV2]] to <16 x i1> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i1> [[TMP18]], <16 x i1> undef, <8 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x i1> [[TMP19]], <16 x i1> undef, <8 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP22]], <16 x i32> [[TMP4]], <16 x i32> [[TMP5]] -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x float> [[F:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x float> [[E:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[TMP24]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = or <16 x i32> [[TMP26]], [[TMP4]] -; CHECK-NEXT: [[TMP28:%.*]] = or <16 x i32> [[TMP27]], [[TMP5]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP28]], <16 x i32> [[TMP23]] -; CHECK-NEXT: [[TMP29:%.*]] = select <16 x i1> [[TMP22]], <16 x float> [[F]], <16 x float> [[E]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP29]] -; -entry: - %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> , i32 4) - %1 = bitcast <8 x i1> %0 to i8 - %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, <8 x i1> , i32 4) - %3 = bitcast <8 x i1> %2 to i8 - %conv = zext i8 %1 to i16 - %conv2 = zext i8 %3 to i16 - %4 = bitcast i16 %conv to <16 x i1> - %5 = bitcast i16 %conv2 to <16 x i1> - %6 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> - %7 = shufflevector <16 x i1> %5, <16 x i1> undef, <8 x i32> - %8 = shufflevector <8 x i1> %6, <8 x i1> %7, <16 x i32> - %9 = select <16 x i1> %8, <16 x float> %f, <16 x float> %e - ret <16 x float> %9 -} - -define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) #0 { -; CHECK-LABEL: @bad_mask_transition_2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: 
[[TMP8:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP8]] to i8 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP9]] to i16 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[CONV]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[F:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[E:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP2]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP16]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[F]], <16 x float> [[E]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP17]] -; -entry: - %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> , i32 4) - %1 = bitcast <8 x i1> %0 to i8 - %conv = zext i8 %1 to i16 - %2 = bitcast i16 %conv to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %f, <16 x float> %e - ret <16 x float> %3 -} - -declare <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double>, <8 x double>, <8 x i1>) -declare <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float>, <16 x float>, <16 x i1>) -declare <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64>, <8 x i64>, <8 x i1>) -declare <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32>, <16 x i32>, <16 x i1>) -declare <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double>, <8 x double>, <8 x i1>) -declare <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float>, <16 x float>, <16 x i1>) -declare <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64>, <8 x i64>, <8 x i1>) -declare <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32>, <16 x i32>, <16 x i1>) - -attributes #0 = { sanitize_memory } From 7107f55d82f8d1077d5478e8f58c94851385c06f Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Mon, 27 Jan 2025 00:25:19 -0300 Subject: [PATCH 158/432] [clang] NFC: remove redundant dyn_cast --- clang/lib/AST/TemplateBase.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp index 3625b6e435a55..0eef8f305fcb3 100644 --- a/clang/lib/AST/TemplateBase.cpp +++ b/clang/lib/AST/TemplateBase.cpp @@ -515,19 +515,17 @@ void TemplateArgument::print(const PrintingPolicy &Policy, raw_ostream &Out, } case Declaration: { - NamedDecl *ND = getAsDecl(); + ValueDecl *VD = getAsDecl(); if (getParamTypeForDecl()->isRecordType()) { - if (auto *TPO = dyn_cast(ND)) { + if (auto *TPO = dyn_cast(VD)) { TPO->getType().getUnqualifiedType().print(Out, Policy); TPO->printAsInit(Out, Policy); break; } } - if (auto *VD = dyn_cast(ND)) { - if (needsAmpersandOnTemplateArg(getParamTypeForDecl(), VD->getType())) - Out << "&"; - } - ND->printQualifiedName(Out); + if (needsAmpersandOnTemplateArg(getParamTypeForDecl(), VD->getType())) + Out << "&"; + VD->printQualifiedName(Out); break; } From 0e6b58202ca9c4d1ca814e4bea5bd3f0bac7f329 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 26 Jan 2025 20:20:51 -0800 Subject: [PATCH 159/432] [ELF] Improve 
 parseSymbolVersion tests for compileBitcodeFiles

Otherwise, the tests won't catch a mistake that removes
`parseSymbolVersion`.
---
 lld/test/ELF/lto/version-script.ll  | 34 ++++------------------------
 lld/test/ELF/lto/version-script2.ll |  8 +++----
 2 files changed, 8 insertions(+), 34 deletions(-)

diff --git a/lld/test/ELF/lto/version-script.ll b/lld/test/ELF/lto/version-script.ll
index 52b9afc38eeed..54a5e01f2dee5 100644
--- a/lld/test/ELF/lto/version-script.ll
+++ b/lld/test/ELF/lto/version-script.ll
@@ -3,7 +3,7 @@
 ; RUN: echo "VERSION_1.0{ global: foo; local: *; }; VERSION_2.0{ global: bar; local: *; };" > %t.script
 ; RUN: ld.lld %t.o -o %t2 -shared --version-script %t.script -save-temps
 ; RUN: llvm-dis < %t2.0.0.preopt.bc | FileCheck %s
-; RUN: llvm-readobj --dyn-syms %t2 | FileCheck --check-prefix=DSO %s
+; RUN: llvm-readelf --dyn-syms %t2 | FileCheck --check-prefix=DSO %s
 
 target triple = "x86_64-unknown-linux-gnu"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
@@ -19,32 +19,6 @@ define void @bar() {
 ; CHECK: define void @foo()
 ; CHECK: define void @bar()
 
-; DSO: DynamicSymbols [
-; DSO: Symbol {
-; DSO: Name:
-; DSO: Value: 0x0
-; DSO: Size: 0
-; DSO: Binding: Local
-; DSO: Type: None
-; DSO: Other: 0
-; DSO: Section: Undefined
-; DSO: }
-; DSO: Symbol {
-; DSO: Name: foo@@VERSION_1.0
-; DSO: Value:
-; DSO: Size: 1
-; DSO: Binding: Global
-; DSO: Type: Function
-; DSO: Other: 0
-; DSO: Section: .text
-; DSO: }
-; DSO: Symbol {
-; DSO: Name: bar@@VERSION_2.0
-; DSO: Value:
-; DSO: Size: 1
-; DSO: Binding: Global
-; DSO: Type: Function
-; DSO: Other: 0
-; DSO: Section: .text
-; DSO: }
-; DSO: ]
+; DSO: Symbol table '.dynsym' contains 3 entries:
+; DSO: 1: {{.*}} 1 FUNC GLOBAL DEFAULT [[#]] foo@@VERSION_1.0{{$}}
+; DSO: 2: {{.*}} 1 FUNC GLOBAL DEFAULT [[#]] bar@@VERSION_2.0{{$}}
diff --git a/lld/test/ELF/lto/version-script2.ll b/lld/test/ELF/lto/version-script2.ll
index dab22750f77b8..5635731518fdb 100644
--- a/lld/test/ELF/lto/version-script2.ll
+++ b/lld/test/ELF/lto/version-script2.ll
@@ -17,16 +17,16 @@
 ; RUN: ld.lld %t.o %tbar.so -o %t.so -shared --version-script %t/ver
 ; RUN: llvm-readelf --dyn-syms %t.so | FileCheck %s
 
-; CHECK: UND bar@VER1
-; CHECK-NEXT: {{[1-9]}} foo@@VER1
+; CHECK: UND bar@VER1{{$}}
+; CHECK-NEXT: {{[1-9]}} foo@@VER1{{$}}
 
 ;; For relocatable output, @ should be retained in the symbol name.
 ;; Don't parse and drop `@VER1`. Also check that --version-script is ignored.
 ; RUN: ld.lld %t.o -o %t.ro -r --version-script %t/ver
 ; RUN: llvm-readelf -s %t.ro | FileCheck %s --check-prefix=RELOCATABLE
 
-; RELOCATABLE: {{[1-9]}} foo@@VER1
-; RELOCATABLE-NEXT: UND bar@VER1
+; RELOCATABLE: {{[1-9]}} foo@@VER1{{$}}
+; RELOCATABLE-NEXT: UND bar@VER1{{$}}
 
 ;--- ver
 VER1 {};

From 2a26292388fcab0c857c91b2d08074c33abd37e8 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sun, 26 Jan 2025 20:32:42 -0800
Subject: [PATCH 160/432] [ELF] Make isExported accurate early

LTO compilation might define symbols not in the symbol table (e.g.
__emutls_v.x in test/ELF/lto/wrap-unreferenced-before-codegen.test).
These symbols have a false `isExported` until
`demoteSymbolsAndComputeIsPreemptible`. This is usually benign as we do
not reference `isExported` that early.

Ensure that `isExported` is correct early. This helps remove a
redundant `isExported` computation in
`demoteSymbolsAndComputeIsPreemptible`.
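In outline, the fix moves the classification to the point where each
post-LTO object file is parsed (a condensed sketch using the names from
the diff below; the `!ctx.arg.relocatable` guard around this loop is
elided):

    for (Symbol *sym : obj->getGlobalSymbols()) {
      if (!sym->isDefined())
        continue;
      // isExported is now accurate as soon as LTO codegen finishes,
      // instead of only after demoteSymbolsAndComputeIsPreemptible.
      if (sym->includeInDynsym(ctx))
        sym->isExported = true;
      if (sym->hasVersionSuffix)
        sym->parseSymbolVersion(ctx);
    }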
---
 lld/ELF/Driver.cpp | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 7e0d3fca31353..06c93710497ed 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -2157,9 +2157,12 @@ static void excludeLibs(Ctx &ctx, opt::InputArgList &args) {
     ArrayRef<Symbol *> symbols = file->getSymbols();
     if (isa<BitcodeFile>(file))
       symbols = cast<BitcodeFile>(file)->getGlobalSymbols();
-    for (Symbol *sym : symbols)
-      if (!sym->isUndefined() && sym->file == file)
+    for (Symbol *sym : symbols) {
+      if (!sym->isUndefined() && sym->file == file) {
         sym->versionId = VER_NDX_LOCAL;
+        sym->isExported = false;
+      }
+    }
   };
 
   for (ELFFileBase *file : ctx.objectFiles)
@@ -2545,11 +2548,17 @@ void LinkerDriver::compileBitcodeFiles(bool skipLinkedOutput) {
     auto *obj = cast<ObjFile<ELFT>>(file.get());
     obj->parse(/*ignoreComdats=*/true);
 
-    // Parse '@' in symbol names for non-relocatable output.
+    // For defined symbols in non-relocatable output,
+    // compute isExported and parse '@'.
     if (!ctx.arg.relocatable)
-      for (Symbol *sym : obj->getGlobalSymbols())
+      for (Symbol *sym : obj->getGlobalSymbols()) {
+        if (!sym->isDefined())
+          continue;
+        if (sym->includeInDynsym(ctx))
+          sym->isExported = true;
         if (sym->hasVersionSuffix)
           sym->parseSymbolVersion(ctx);
+      }
     ctx.objectFiles.push_back(obj);
   }
 }
@@ -3061,7 +3070,7 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
 
   // Handle --exclude-libs again because lto.tmp may reference additional
   // libcalls symbols defined in an excluded archive. This may override
-  // versionId set by scanVersionScript().
+  // versionId set by scanVersionScript() and isExported.
   if (args.hasArg(OPT_exclude_libs))
     excludeLibs(ctx, args);
 
From 1a4d6de1b532149b10522eae5dabce39e5f7c687 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sun, 26 Jan 2025 20:43:03 -0800
Subject: [PATCH 161/432] [ELF] Remove redundant isExported computation

Commit 2a26292388fcab0c857c91b2d08074c33abd37e8 made `isExported`
accurate except a few linker-synthesized symbols in finalizeSections.
We can collect these linker-synthesized symbols into a vector and avoid
recomputation for other symbols.
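The resulting flow, condensed (names as in the diffs below; simplified):

    // addOptionalRegular() records each linker-synthesized symbol:
    ctx.synthesizedSymbols.push_back(s);

    // finalizeSections() recomputes isExported only for that small set;
    // every other symbol keeps the value computed early:
    for (Symbol *sym : ctx.synthesizedSymbols)
      sym->isExported = sym->includeInDynsym(ctx);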
---
 lld/ELF/Config.h   |  1 +
 lld/ELF/Writer.cpp | 12 ++++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index b2859486d58e9..b1d7fb88553e2 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -619,6 +619,7 @@ struct Ctx : CommonLinkerContext {
   };
   ElfSym sym{};
   std::unique_ptr<SymbolTable> symtab;
+  SmallVector<Symbol *, 0> synthesizedSymbols;
 
   SmallVector<std::unique_ptr<MemoryBuffer>> memoryBuffers;
   SmallVector<ELFFileBase *, 0> objectFiles;
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index b7c4790655e8a..4b75137a5db21 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -149,6 +149,7 @@ static Defined *addOptionalRegular(Ctx &ctx, StringRef name, SectionBase *sec,
   if (!s || s->isDefined() || s->isCommon())
     return nullptr;
 
+  ctx.synthesizedSymbols.push_back(s);
   s->resolve(ctx, Defined{ctx, ctx.internalFile, StringRef(), STB_GLOBAL,
                           stOther, STT_NOTYPE, val,
                           /*size=*/0, sec});
@@ -282,6 +283,7 @@ static void demoteDefined(Defined &sym, DenseMap<SectionBase *, size_t> &map) {
 static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
   llvm::TimeTraceScope timeScope("Demote symbols");
   DenseMap<InputFile *, DenseMap<SectionBase *, size_t>> sectionIndexMap;
+  bool hasDynSymTab = ctx.arg.hasDynSymTab;
   for (Symbol *sym : ctx.symtab->getSymbols()) {
     if (auto *d = dyn_cast<Defined>(sym)) {
       if (d->section && !d->section->isLive())
@@ -294,11 +296,12 @@ static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
                sym->type)
           .overwrite(*sym);
       sym->versionId = VER_NDX_GLOBAL;
+      if (sym->includeInDynsym(ctx))
+        sym->isExported = true;
     }
   }
 
-    sym->isExported = sym->includeInDynsym(ctx);
-    if (ctx.arg.hasDynSymTab)
+    if (hasDynSymTab)
       sym->isPreemptible = sym->isExported && computeIsPreemptible(ctx, *sym);
   }
 }
@@ -1846,6 +1849,11 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
     }
   }
 
+  // If the previous code block defines any non-hidden symbols (e.g.
+  // __global_pointer$), they may be exported.
+  for (Symbol *sym : ctx.synthesizedSymbols)
+    sym->isExported = sym->includeInDynsym(ctx);
+
   demoteSymbolsAndComputeIsPreemptible(ctx);
 
   if (ctx.arg.copyRelocs && ctx.arg.discard != DiscardPolicy::None)

From b9efbed468ec18044070eea936c694fb8f6e244b Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sun, 26 Jan 2025 21:14:49 -0800
Subject: [PATCH 162/432] Revert "Move HIP fatbin sections farther away from
 .text"

This reverts commit 048f35037779763963c4b4478a0884e828ea9538.
This reverts commit f7bbc40b0736cc417f57cd039b098b504cf6a71f.

Related to #95949. A developer with no prior lld contribution and very
little AMD contribution sneaked in these application-specific section
order rules we discourage.
---
 lld/ELF/Writer.cpp                | 10 --------
 lld/test/ELF/hip-section-layout.s | 41 -------------------------------
 2 files changed, 51 deletions(-)
 delete mode 100644 lld/test/ELF/hip-section-layout.s

diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 4b75137a5db21..6b34f87f0b8d0 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -650,7 +650,6 @@ static bool isRelroSection(Ctx &ctx, const OutputSection *sec) {
 enum RankFlags {
   RF_NOT_ADDR_SET = 1 << 27,
   RF_NOT_ALLOC = 1 << 26,
-  RF_HIP_FATBIN = 1 << 19,
   RF_PARTITION = 1 << 18, // Partition number (8 bits)
   RF_LARGE_ALT = 1 << 15,
   RF_WRITE = 1 << 14,
@@ -748,15 +747,6 @@ unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) {
   if (osec.type == SHT_NOBITS)
     rank |= RF_BSS;
 
-  // Put HIP fatbin related sections further away to avoid wasting relocation
-  // range to jump over them. Make sure .hip_fatbin is the furthest.
- if (osec.name == ".hipFatBinSegment") - rank |= RF_HIP_FATBIN; - if (osec.name == ".hip_gpubin_handle") - rank |= RF_HIP_FATBIN | 2; - if (osec.name == ".hip_fatbin") - rank |= RF_HIP_FATBIN | RF_WRITE | 3; - // Some architectures have additional ordering restrictions for sections // within the same PT_LOAD. if (ctx.arg.emachine == EM_PPC64) { diff --git a/lld/test/ELF/hip-section-layout.s b/lld/test/ELF/hip-section-layout.s deleted file mode 100644 index b76141c6b41ae..0000000000000 --- a/lld/test/ELF/hip-section-layout.s +++ /dev/null @@ -1,41 +0,0 @@ -# REQUIRES: x86 -## Test HIP specific sections layout. - -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux --defsym=HIP_SECTIONS=1 --defsym=NON_HIP_SECTIONS=1 %s -o %t.o -# RUN: ld.lld %t.o -o %t.out -# RUN: llvm-readobj --sections %t.out | FileCheck %s - -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux --defsym=NON_HIP_SECTIONS=1 %s -o %t.1.o -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux --defsym=HIP_SECTIONS=1 %s -o %t.2.o -# RUN: ld.lld %t.1.o %t.2.o -o %t.1.s.out -# RUN: llvm-readobj --sections %t.1.s.out | FileCheck %s -# RUN: ld.lld %t.2.o %t.1.o -o %t.2.s.out -# RUN: llvm-readobj --sections %t.2.s.out | FileCheck %s - -.ifdef HIP_SECTIONS -.section .hipFatBinSegment,"aw",@progbits; .space 1 -.section .hip_gpubin_handle,"aw",@progbits; .space 1 -.section .hip_fatbin,"a",@progbits; .space 1 -.endif - -.ifdef NON_HIP_SECTIONS -.global _start -.text -_start: -.section .bss,"aw",@nobits; .space 1 -.section .debug_info,"",@progbits -.section .debug_line,"",@progbits -.section .debug_str,"MS",@progbits,1 -.endif - -# Check that the HIP sections are placed towards the end but before non allocated sections - -// CHECK: Name: .text -// CHECK: Name: .bss -// CHECK: Name: .hipFatBinSegment -// CHECK: Name: .hip_gpubin_handle -// CHECK: Name: .hip_fatbin -// CHECK: Name: .debug_info -// CHECK: Name: .debug_line -// CHECK: Name: .debug_str - From 6805d7e8aa5f2ecea021acbb8c6b4c29ca432e78 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 26 Jan 2025 22:28:31 -0800 Subject: [PATCH 163/432] [test] Convert remove-note.test from \r\n to \n after #118739 --- .../tools/llvm-objcopy/ELF/remove-note.test | 396 +++++++++--------- 1 file changed, 198 insertions(+), 198 deletions(-) diff --git a/llvm/test/tools/llvm-objcopy/ELF/remove-note.test b/llvm/test/tools/llvm-objcopy/ELF/remove-note.test index f8936bf9ea731..e15f934dfe2da 100644 --- a/llvm/test/tools/llvm-objcopy/ELF/remove-note.test +++ b/llvm/test/tools/llvm-objcopy/ELF/remove-note.test @@ -1,198 +1,198 @@ -## Check incompatible options. -# RUN: not llvm-objcopy --remove-note=1 --remove-section=.test - 2>&1 | FileCheck %s --check-prefix=ERR-REMSEC -# RUN: not llvm-objcopy --remove-note=1 --add-section=.test=%s - 2>&1 | FileCheck %s --check-prefix=ERR-ADDSEC -# RUN: not llvm-objcopy --remove-note=1 --update-section=.test=%s - 2>&1 | FileCheck %s --check-prefix=ERR-UPDSEC - -# ERR-REMSEC: error: cannot specify both --remove-note and --remove-section -# ERR-ADDSEC: error: cannot specify both --remove-note and --add-section -# ERR-UPDSEC: error: cannot specify both --remove-note and --update-section - -## Check invalid argument formats. 
-# RUN: not llvm-objcopy --remove-note= - 2>&1 | FileCheck %s --check-prefix=ERR-NOTYPEID -# RUN: not llvm-objcopy --remove-note=CORE/ - 2>&1 | FileCheck %s --check-prefix=ERR-NOTYPEID -# RUN: not llvm-objcopy --remove-note=/1 - 2>&1 | FileCheck %s --check-prefix=ERR-EMPTYNAME -# RUN: not llvm-objcopy --remove-note=CORE/1/2 - 2>&1 | FileCheck %s --check-prefix=ERR-INVNUM1 -# RUN: not llvm-objcopy --remove-note=Notanumber - 2>&1 | FileCheck %s --check-prefix=ERR-INVNUM2 -# RUN: not llvm-objcopy --remove-note=CORE/Notanumber - 2>&1 | FileCheck %s --check-prefix=ERR-INVNUM2 - -# ERR-NOTYPEID: error: bad format for --remove-note, missing type_id -# ERR-EMPTYNAME: error: bad format for --remove-note, note name is empty -# ERR-INVNUM1: error: bad note type_id for --remove-note: '1/2' -# ERR-INVNUM2: error: bad note type_id for --remove-note: 'Notanumber' - -## Check deleting notes: -## * --remove-note=1 will remove note "CORE/1" and "LINUX/1", -## * --remove-note=DUMMY/2 will not remove any notes because there are no notes with this owner, -## * --remove-note=CORE/3 will remove "CORE/3" but preserve "LINUX/3". -# RUN: yaml2obj --docnum=1 -D ALIGN=8 -D ELFCLASS=64 -D ENDIANNESS=LSB %s -o %t8.64.lsb -# RUN: llvm-objcopy --remove-note=0x01 --remove-note=DUMMY/2 --remove-note=CORE/0x03 %t8.64.lsb %t8.64.lsb.o -# RUN: llvm-readobj --segments --sections --notes %t8.64.lsb.o | \ -# RUN: FileCheck %s -D#SIZE0=32 -D#SIZE1=64 - -# RUN: yaml2obj --docnum=1 -D ALIGN=4 -D ELFCLASS=64 -D ENDIANNESS=MSB %s -o %t4.64.msb -# RUN: llvm-objcopy --remove-note=0x01 --remove-note=DUMMY/0x02 --remove-note=CORE/3 %t4.64.msb %t4.64.msb.o -# RUN: llvm-readobj --segments --sections --notes %t4.64.msb.o | \ -# RUN: FileCheck %s -D#SIZE0=24 -D#SIZE1=48 - -# RUN: yaml2obj --docnum=1 -D ALIGN=4 -D ELFCLASS=32 -D ENDIANNESS=LSB %s -o %t4.32.lsb -# RUN: llvm-objcopy --remove-note=1 --remove-note=DUMMY/0x02 --remove-note=CORE/3 %t4.32.lsb %t4.32.lsb.o -# RUN: llvm-readobj --segments --sections --notes %t4.32.lsb.o | \ -# RUN: FileCheck %s -D#SIZE0=24 -D#SIZE1=48 - -# CHECK: Sections [ -# CHECK: Section { -# CHECK: Name: .note0 -# CHECK-NEXT: Type: SHT_NOTE -# CHECK-NEXT: Flags [ -# CHECK-NEXT: ] -# CHECK-NEXT: Address: -# CHECK-NEXT: Offset: -# CHECK-NEXT: Size: [[#%d,SIZE0]] -# CHECK: Name: .note1 -# CHECK-NEXT: Type: SHT_NOTE -# CHECK-NEXT: Flags [ -# CHECK-NEXT: ] -# CHECK-NEXT: Address: -# CHECK-NEXT: Offset: -# CHECK-NEXT: Size: [[#%d,SIZE1]] -# CHECK: Name: .note2 -# CHECK-NEXT: Type: SHT_NOTE -# CHECK-NEXT: Flags [ -# CHECK-NEXT: ] -# CHECK-NEXT: Address: -# CHECK-NEXT: Offset: -# CHECK-NEXT: Size: 0 - -# CHECK: NoteSections [ -# CHECK-NEXT: NoteSection { -# CHECK-NEXT: Name: .note0 -# CHECK-NEXT: Offset: -# CHECK-NEXT: Size: 0x[[#%X,SIZE0]] -# CHECK-NEXT: Notes [ -# CHECK-NEXT: { -# CHECK-NEXT: Owner: CORE -# CHECK-NEXT: Data size: 0x2 -# CHECK-NEXT: Type: NT_ARCH -# CHECK-NEXT: Description data ( -# CHECK-NEXT: 0000: 0201 -# CHECK-NEXT: ) -# CHECK-NEXT: } -# CHECK-NEXT: ] -# CHECK-NEXT: } -# CHECK-NEXT: NoteSection { -# CHECK-NEXT: Name: .note1 -# CHECK-NEXT: Offset: -# CHECK-NEXT: Size: 0x[[#%X,SIZE1]] -# CHECK-NEXT: Notes [ -# CHECK-NEXT: { -# CHECK-NEXT: Owner: LINUX -# CHECK-NEXT: Data size: 0x2 -# CHECK-NEXT: Type: Unknown (0x00000003) -# CHECK-NEXT: Description data ( -# CHECK-NEXT: 0000: 0301 -# CHECK-NEXT: ) -# CHECK-NEXT: } -# CHECK-NEXT: { -# CHECK-NEXT: Owner: CORE -# CHECK-NEXT: Data size: 0x2 -# CHECK-NEXT: Type: Unknown (0x00000004) -# CHECK-NEXT: Description data ( -# CHECK-NEXT: 0000: 0401 -# CHECK-NEXT: 
) -# CHECK-NEXT: } -# CHECK-NEXT: ] -# CHECK-NEXT: } -# CHECK-NEXT: NoteSection { -# CHECK-NEXT: Name: .note2 -# CHECK-NEXT: Offset: -# CHECK-NEXT: Size: 0x0 -# CHECK-NEXT: Notes [ -# CHECK-NEXT: ] -# CHECK-NEXT: } - ---- !ELF -FileHeader: - Class: ELFCLASS[[ELFCLASS]] - Data: ELFDATA2[[ENDIANNESS]] - Type: ET_REL - Machine: EM_X86_64 -Sections: - - Name: .note0 - Type: SHT_NOTE - AddressAlign: [[ALIGN]] - Notes: - - Name: CORE - Type: 0x01 - Desc: 0101 - - Name: CORE - Type: 0x02 - Desc: 0201 - - Name: .note1 - Type: SHT_NOTE - AddressAlign: [[ALIGN]] - Notes: - - Name: LINUX - Type: 0x03 - Desc: 0301 - - Name: CORE - Type: 0x03 - Desc: 0302 - - Name: CORE - Type: 0x04 - Desc: 0401 - - Name: .note2 - Type: SHT_NOTE - AddressAlign: [[ALIGN]] - Notes: - - Name: LINUX - Type: 0x01 - Desc: 0102 - -# RUN: yaml2obj --docnum=2 %s -o %t2 -# RUN: llvm-objcopy --remove-note=1 %t2 %t2o 2>&1 | FileCheck %s --check-prefix=TEST2 -# TEST2: warning: note segments are not supported -# TEST2-NOT: note segments are not supported - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_CORE - Machine: EM_X86_64 -ProgramHeaders: - - Type: PT_NOTE - FirstSec: .data0 - LastSec: .data0 - - Type: PT_NOTE - FirstSec: .data1 - LastSec: .data1 -Sections: - - Name: .data0 - Type: Fill - Size: 8 - - Name: .data1 - Type: Fill - Size: 8 - -# RUN: yaml2obj --docnum=3 %s -o %t3 -# RUN: llvm-objcopy --remove-note=1 %t3 %t3o 2>&1 | FileCheck %s --check-prefix=TEST3 -# TEST3: warning: cannot remove note(s) from .note: sections in segments are not supported - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_EXEC - Machine: EM_X86_64 -ProgramHeaders: - - Type: PT_LOAD - FirstSec: .note - LastSec: .note -Sections: - - Name: .note - Type: SHT_NOTE - AddressAlign: 4 - Notes: - - Name: ABC - Type: 1 - Desc: 0102 +## Check incompatible options. +# RUN: not llvm-objcopy --remove-note=1 --remove-section=.test - 2>&1 | FileCheck %s --check-prefix=ERR-REMSEC +# RUN: not llvm-objcopy --remove-note=1 --add-section=.test=%s - 2>&1 | FileCheck %s --check-prefix=ERR-ADDSEC +# RUN: not llvm-objcopy --remove-note=1 --update-section=.test=%s - 2>&1 | FileCheck %s --check-prefix=ERR-UPDSEC + +# ERR-REMSEC: error: cannot specify both --remove-note and --remove-section +# ERR-ADDSEC: error: cannot specify both --remove-note and --add-section +# ERR-UPDSEC: error: cannot specify both --remove-note and --update-section + +## Check invalid argument formats. 
+# RUN: not llvm-objcopy --remove-note= - 2>&1 | FileCheck %s --check-prefix=ERR-NOTYPEID +# RUN: not llvm-objcopy --remove-note=CORE/ - 2>&1 | FileCheck %s --check-prefix=ERR-NOTYPEID +# RUN: not llvm-objcopy --remove-note=/1 - 2>&1 | FileCheck %s --check-prefix=ERR-EMPTYNAME +# RUN: not llvm-objcopy --remove-note=CORE/1/2 - 2>&1 | FileCheck %s --check-prefix=ERR-INVNUM1 +# RUN: not llvm-objcopy --remove-note=Notanumber - 2>&1 | FileCheck %s --check-prefix=ERR-INVNUM2 +# RUN: not llvm-objcopy --remove-note=CORE/Notanumber - 2>&1 | FileCheck %s --check-prefix=ERR-INVNUM2 + +# ERR-NOTYPEID: error: bad format for --remove-note, missing type_id +# ERR-EMPTYNAME: error: bad format for --remove-note, note name is empty +# ERR-INVNUM1: error: bad note type_id for --remove-note: '1/2' +# ERR-INVNUM2: error: bad note type_id for --remove-note: 'Notanumber' + +## Check deleting notes: +## * --remove-note=1 will remove note "CORE/1" and "LINUX/1", +## * --remove-note=DUMMY/2 will not remove any notes because there are no notes with this owner, +## * --remove-note=CORE/3 will remove "CORE/3" but preserve "LINUX/3". +# RUN: yaml2obj --docnum=1 -D ALIGN=8 -D ELFCLASS=64 -D ENDIANNESS=LSB %s -o %t8.64.lsb +# RUN: llvm-objcopy --remove-note=0x01 --remove-note=DUMMY/2 --remove-note=CORE/0x03 %t8.64.lsb %t8.64.lsb.o +# RUN: llvm-readobj --segments --sections --notes %t8.64.lsb.o | \ +# RUN: FileCheck %s -D#SIZE0=32 -D#SIZE1=64 + +# RUN: yaml2obj --docnum=1 -D ALIGN=4 -D ELFCLASS=64 -D ENDIANNESS=MSB %s -o %t4.64.msb +# RUN: llvm-objcopy --remove-note=0x01 --remove-note=DUMMY/0x02 --remove-note=CORE/3 %t4.64.msb %t4.64.msb.o +# RUN: llvm-readobj --segments --sections --notes %t4.64.msb.o | \ +# RUN: FileCheck %s -D#SIZE0=24 -D#SIZE1=48 + +# RUN: yaml2obj --docnum=1 -D ALIGN=4 -D ELFCLASS=32 -D ENDIANNESS=LSB %s -o %t4.32.lsb +# RUN: llvm-objcopy --remove-note=1 --remove-note=DUMMY/0x02 --remove-note=CORE/3 %t4.32.lsb %t4.32.lsb.o +# RUN: llvm-readobj --segments --sections --notes %t4.32.lsb.o | \ +# RUN: FileCheck %s -D#SIZE0=24 -D#SIZE1=48 + +# CHECK: Sections [ +# CHECK: Section { +# CHECK: Name: .note0 +# CHECK-NEXT: Type: SHT_NOTE +# CHECK-NEXT: Flags [ +# CHECK-NEXT: ] +# CHECK-NEXT: Address: +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: [[#%d,SIZE0]] +# CHECK: Name: .note1 +# CHECK-NEXT: Type: SHT_NOTE +# CHECK-NEXT: Flags [ +# CHECK-NEXT: ] +# CHECK-NEXT: Address: +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: [[#%d,SIZE1]] +# CHECK: Name: .note2 +# CHECK-NEXT: Type: SHT_NOTE +# CHECK-NEXT: Flags [ +# CHECK-NEXT: ] +# CHECK-NEXT: Address: +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: 0 + +# CHECK: NoteSections [ +# CHECK-NEXT: NoteSection { +# CHECK-NEXT: Name: .note0 +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: 0x[[#%X,SIZE0]] +# CHECK-NEXT: Notes [ +# CHECK-NEXT: { +# CHECK-NEXT: Owner: CORE +# CHECK-NEXT: Data size: 0x2 +# CHECK-NEXT: Type: NT_ARCH +# CHECK-NEXT: Description data ( +# CHECK-NEXT: 0000: 0201 +# CHECK-NEXT: ) +# CHECK-NEXT: } +# CHECK-NEXT: ] +# CHECK-NEXT: } +# CHECK-NEXT: NoteSection { +# CHECK-NEXT: Name: .note1 +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: 0x[[#%X,SIZE1]] +# CHECK-NEXT: Notes [ +# CHECK-NEXT: { +# CHECK-NEXT: Owner: LINUX +# CHECK-NEXT: Data size: 0x2 +# CHECK-NEXT: Type: Unknown (0x00000003) +# CHECK-NEXT: Description data ( +# CHECK-NEXT: 0000: 0301 +# CHECK-NEXT: ) +# CHECK-NEXT: } +# CHECK-NEXT: { +# CHECK-NEXT: Owner: CORE +# CHECK-NEXT: Data size: 0x2 +# CHECK-NEXT: Type: Unknown (0x00000004) +# CHECK-NEXT: Description data ( +# CHECK-NEXT: 0000: 0401 +# CHECK-NEXT: 
) +# CHECK-NEXT: } +# CHECK-NEXT: ] +# CHECK-NEXT: } +# CHECK-NEXT: NoteSection { +# CHECK-NEXT: Name: .note2 +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: 0x0 +# CHECK-NEXT: Notes [ +# CHECK-NEXT: ] +# CHECK-NEXT: } + +--- !ELF +FileHeader: + Class: ELFCLASS[[ELFCLASS]] + Data: ELFDATA2[[ENDIANNESS]] + Type: ET_REL + Machine: EM_X86_64 +Sections: + - Name: .note0 + Type: SHT_NOTE + AddressAlign: [[ALIGN]] + Notes: + - Name: CORE + Type: 0x01 + Desc: 0101 + - Name: CORE + Type: 0x02 + Desc: 0201 + - Name: .note1 + Type: SHT_NOTE + AddressAlign: [[ALIGN]] + Notes: + - Name: LINUX + Type: 0x03 + Desc: 0301 + - Name: CORE + Type: 0x03 + Desc: 0302 + - Name: CORE + Type: 0x04 + Desc: 0401 + - Name: .note2 + Type: SHT_NOTE + AddressAlign: [[ALIGN]] + Notes: + - Name: LINUX + Type: 0x01 + Desc: 0102 + +# RUN: yaml2obj --docnum=2 %s -o %t2 +# RUN: llvm-objcopy --remove-note=1 %t2 %t2o 2>&1 | FileCheck %s --check-prefix=TEST2 +# TEST2: warning: note segments are not supported +# TEST2-NOT: note segments are not supported + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_CORE + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_NOTE + FirstSec: .data0 + LastSec: .data0 + - Type: PT_NOTE + FirstSec: .data1 + LastSec: .data1 +Sections: + - Name: .data0 + Type: Fill + Size: 8 + - Name: .data1 + Type: Fill + Size: 8 + +# RUN: yaml2obj --docnum=3 %s -o %t3 +# RUN: llvm-objcopy --remove-note=1 %t3 %t3o 2>&1 | FileCheck %s --check-prefix=TEST3 +# TEST3: warning: cannot remove note(s) from .note: sections in segments are not supported + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_LOAD + FirstSec: .note + LastSec: .note +Sections: + - Name: .note + Type: SHT_NOTE + AddressAlign: 4 + Notes: + - Name: ABC + Type: 1 + Desc: 0102 From 9452ee4f750a849148a391ac75eb31220343fa1e Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Mon, 27 Jan 2025 00:11:18 -0800 Subject: [PATCH 164/432] [clang-format] Treat uppercase identifiers after struct as macros (#124397) This restores the behavior before llvmorg-20-init. Fixes #94184. Fixes #117477. Fixes #122690. Fixes #123142. 
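For illustration, the inputs the restored heuristic distinguishes
(EXPORT stands in for any all-uppercase macro; this snippet is not part
of the patch):

    // An all-uppercase identifier after `struct` is assumed to be a
    // macro, so StructName is taken as the record name and the braces
    // are annotated as TT_StructLBrace/TT_StructRBrace even when EXPORT
    // is not listed in AttributeMacros:
    struct EXPORT StructName {};

    // A mixed-case identifier is still treated as the record name:
    struct StructName {};

Listing the macro in AttributeMacros (as the updated test below also
checks) achieves the same without relying on the uppercase heuristic.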
---
 clang/lib/Format/UnwrappedLineParser.cpp      | 3 ++-
 clang/unittests/Format/TokenAnnotatorTest.cpp | 9 ++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 4258329136348..906fc11a07d5e 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -4075,7 +4075,8 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) {
       break;
     default:
       if (!JSPastExtendsOrImplements && !ClassName &&
-          Previous->is(tok::identifier) && Previous->isNot(TT_AttributeMacro)) {
+          Previous->is(tok::identifier) && Previous->isNot(TT_AttributeMacro) &&
+          Previous->TokenText != Previous->TokenText.upper()) {
         ClassName = Previous;
       }
     }
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 10587449dcea9..585878e0edc5b 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -560,9 +560,16 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
   ASSERT_EQ(Tokens.size(), 15u) << Tokens;
   EXPECT_TOKEN(Tokens[11], tok::l_brace, TT_StructLBrace);
 
+  constexpr StringRef Code{"struct EXPORT StructName {};"};
+
+  Tokens = annotate(Code);
+  ASSERT_EQ(Tokens.size(), 7u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::l_brace, TT_StructLBrace);
+  EXPECT_TOKEN(Tokens[4], tok::r_brace, TT_StructRBrace);
+
   auto Style = getLLVMStyle();
   Style.AttributeMacros.push_back("EXPORT");
-  Tokens = annotate("struct EXPORT StructName {};", Style);
+  Tokens = annotate(Code, Style);
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
   EXPECT_TOKEN(Tokens[1], tok::identifier, TT_AttributeMacro);
   EXPECT_TOKEN(Tokens[3], tok::l_brace, TT_StructLBrace);

From a01e1d4e044ec0147e04a5af9ca54ede550f5dc1 Mon Sep 17 00:00:00 2001
From: Nathan Ridge
Date: Mon, 27 Jan 2025 03:31:56 -0500
Subject: [PATCH 165/432] [clang][Sema] Handle dependent qualifier in
 HeuristicResolver::resolveDeclRefExpr() (#124515)

---
 clang/lib/Sema/HeuristicResolver.cpp           |  5 +++--
 .../unittests/Sema/HeuristicResolverTest.cpp   | 21 +++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/HeuristicResolver.cpp b/clang/lib/Sema/HeuristicResolver.cpp
index e893afed71d26..87c7274e7aefa 100644
--- a/clang/lib/Sema/HeuristicResolver.cpp
+++ b/clang/lib/Sema/HeuristicResolver.cpp
@@ -262,8 +262,9 @@ std::vector<const NamedDecl *> HeuristicResolverImpl::resolveMemberExpr(
 
 std::vector<const NamedDecl *>
 HeuristicResolverImpl::resolveDeclRefExpr(const DependentScopeDeclRefExpr *RE) {
-  return resolveDependentMember(QualType(RE->getQualifier()->getAsType(), 0),
-                                RE->getDeclName(), StaticFilter);
+  return resolveDependentMember(
+      resolveNestedNameSpecifierToType(RE->getQualifier()), RE->getDeclName(),
+      StaticFilter);
 }
 
 std::vector<const NamedDecl *>
diff --git a/clang/unittests/Sema/HeuristicResolverTest.cpp b/clang/unittests/Sema/HeuristicResolverTest.cpp
index 2b775b11719ea..e5cd1254d7542 100644
--- a/clang/unittests/Sema/HeuristicResolverTest.cpp
+++ b/clang/unittests/Sema/HeuristicResolverTest.cpp
@@ -385,6 +385,27 @@ TEST(HeuristicResolver, DeclRefExpr_RespectScope) {
       dependentScopeDeclRefExpr(hasDependentName("getPointer")).bind("input"));
 }
 
+TEST(HeuristicResolver, DeclRefExpr_Nested) {
+  std::string Code = R"cpp(
+    struct S {
+      static int Waldo;
+    };
+    template <typename T>
+    struct Meta {
+      using Type = S;
+    };
+    template <typename T>
+    void foo() {
+      Meta<T>::Type::Waldo;
+    }
+  )cpp";
+  // Test resolution of "Waldo" in "Meta<T>::Type::Waldo".
+ expectResolution( + Code, &HeuristicResolver::resolveDeclRefExpr, + dependentScopeDeclRefExpr(hasDependentName("Waldo")).bind("input"), + varDecl(hasName("Waldo")).bind("output")); +} + TEST(HeuristicResolver, DependentNameType) { std::string Code = R"cpp( template From bd38c4993aa41d89a13cbc4dc457df4d81e410bf Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 27 Jan 2025 09:21:56 +0000 Subject: [PATCH 166/432] [AArch64] Generate zeroing forms of certain SVE2.2 instructions (8/11) (#116834) SVE2.2 introduces instructions with predicated forms with zeroing of the inactive lanes. This allows in some cases to save a `movprfx` or a `mov` instruction when emitting code for `_x` or `_z` variants of intrinsics. This patch adds support for emitting the zeroing forms of certain `FRINTx`, `FRECPX`, and `FSQRT` instructions. --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 18 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 9 +- .../zeroing-forms-frint-frecpx-fsqrt.ll | 4656 +++++++++++++++++ 3 files changed, 4673 insertions(+), 10 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/zeroing-forms-frint-frecpx-fsqrt.ll diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 6d5e2697160ab..9ed683e73e9cc 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4301,17 +4301,17 @@ let Predicates = [HasSVE2p2_or_SME2p2] in { defm FRINT64X_ZPzZ : sve_fp_z2op_p_zd_frint<0b11, "frint64x">; // Floating-point round to integral fp value, zeroing predicate - defm FRINTN_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00000, "frintn">; - defm FRINTP_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00001, "frintp">; - defm FRINTM_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00010, "frintm">; - defm FRINTZ_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00011, "frintz">; - defm FRINTA_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00100, "frinta">; - defm FRINTX_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00110, "frintx">; - defm FRINTI_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00111, "frinti">; + defm FRINTN_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00000, "frintn", AArch64frintn_mt>; + defm FRINTP_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00001, "frintp", AArch64frintp_mt>; + defm FRINTM_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00010, "frintm", AArch64frintm_mt>; + defm FRINTZ_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00011, "frintz", AArch64frintz_mt>; + defm FRINTA_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00100, "frinta", AArch64frinta_mt>; + defm FRINTX_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00110, "frintx", AArch64frintx_mt>; + defm FRINTI_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00111, "frinti", AArch64frinti_mt>; // Floating-point invert exponent, zeroing predicate - defm FRECPX_ZPzZ : sve_fp_z2op_p_zd_hsd<0b01100, "frecpx">; + defm FRECPX_ZPzZ : sve_fp_z2op_p_zd_hsd<0b01100, "frecpx", AArch64frecpx_mt>; // Floating-point square root, zeroing predicate - defm FSQRT_ZPZz : sve_fp_z2op_p_zd_hsd<0b01101, "fsqrt">; + defm FSQRT_ZPZz : sve_fp_z2op_p_zd_hsd<0b01101, "fsqrt", AArch64fsqrt_mt>; // SVE2p2 integer unary arithmetic (bitwise), zeroing predicate defm CLS_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b000, "cls", AArch64cls_mt>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 2ee9910da5079..8125014faa033 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -3270,10 +3270,17 @@ multiclass sve_fp_z2op_p_zd { defm : SVE_3_Op_UndefZero_Pat(NAME # _DtoS)>; } -multiclass sve_fp_z2op_p_zd_hsd opc, string asm> { +multiclass sve_fp_z2op_p_zd_hsd opc, string asm, 
SDPatternOperator op> { def _H : sve_fp_z2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16>; def _S : sve_fp_z2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32>; def _D : sve_fp_z2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64>; + + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _S)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _S)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _D)>; } multiclass sve_fp_z2op_p_zd_frint opc, string asm> { diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-frint-frecpx-fsqrt.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-frint-frecpx-fsqrt.ll new file mode 100644 index 0000000000000..c493ec2dcc95d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-frint-frecpx-fsqrt.ll @@ -0,0 +1,4656 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve < %s | FileCheck %s +; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2 + +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2 + +target triple = "aarch64-linux" + +define @test_svrinta_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinta_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinta z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinta_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinta_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinta z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_4f16_z: +; CHECK: // %bb.0: // 
%entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinta_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinta_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinta z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinta_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinta_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinta z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frinta z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinta_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinta_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinta z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.s, p0/m, 
z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frinta z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinta_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinta_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinta z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frinta z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinti_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinti_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinti_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinti_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_4f16_x_1: +; CHECK-2p2: // 
%bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinti_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinti_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinti_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinti_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frinti z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: 
frinti z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinti_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinti_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frinti z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinti_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinti_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f64( poison, %pg, %x) + ret %0 +} + + +define @test_svrinti_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frinti z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + + +define @test_svrintm_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintm_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintm z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintm_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintm_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call 
@llvm.aarch64.sve.frintm.nxv8f16( poison, %pg, %x) + ret %0 +} + + +define @test_svrintm_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintm_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintm_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintm_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintm z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintm_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintm_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintm_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintm_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintm_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintm_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintm z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintm_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintm_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintm_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintm_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintm_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintm_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintm z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f32( poison, %pg, %x) + ret %0 +} + 
+define <vscale x 2 x float> @test_svrintm_2f32_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x float> %x) {
+; CHECK-LABEL: test_svrintm_2f32_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintm z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_2f32_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x float> @llvm.aarch64.sve.frintm.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x float> %x)
+  ret <vscale x 2 x float> %0
+}
+
+define <vscale x 2 x float> @test_svrintm_2f32_z(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x float> %x) {
+; CHECK-LABEL: test_svrintm_2f32_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    frintm z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_2f32_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x float> @llvm.aarch64.sve.frintm.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x float> %x)
+  ret <vscale x 2 x float> %0
+}
+
+define <vscale x 4 x float> @test_svrintm_f32_x_1(<vscale x 4 x i1> %pg, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svrintm_f32_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_f32_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.s, p0/z, z0.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.frintm.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svrintm_f32_x_2(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svrintm_f32_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintm z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_f32_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.frintm.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svrintm_f32_z(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svrintm_f32_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    frintm z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_f32_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.frintm.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 2 x double> @test_svrintm_f64_x_1(<vscale x 2 x i1> %pg, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svrintm_f64_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintm z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_f64_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.d, p0/z, z0.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.frintm.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svrintm_f64_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svrintm_f64_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintm z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_f64_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.frintm.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svrintm_f64_z(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svrintm_f64_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, #0 // =0x0
+; CHECK-NEXT:    frintm z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_f64_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.frintm.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 8 x half> @test_svrintn_f16_x_1(<vscale x 8 x i1> %pg, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svrintn_f16_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f16_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z0.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.frintn.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svrintn_f16_x_2(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svrintn_f16_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintn z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f16_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.frintn.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svrintn_f16_z(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svrintn_f16_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    frintn z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f16_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.frintn.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 4 x half> @test_svrintn_4f16_x_1(<vscale x 4 x i1> %pg, <vscale x 4 x half> %x) {
+; CHECK-LABEL: test_svrintn_4f16_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_4f16_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z0.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x half> @llvm.aarch64.sve.frintn.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> %pg, <vscale x 4 x half> %x)
+  ret <vscale x 4 x half> %0
+}
+
+define <vscale x 4 x half> @test_svrintn_4f16_x_2(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x half> %x) {
+; CHECK-LABEL: test_svrintn_4f16_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintn z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_4f16_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x half> @llvm.aarch64.sve.frintn.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> %pg, <vscale x 4 x half> %x)
+  ret <vscale x 4 x half> %0
+}
+
+define <vscale x 4 x half> @test_svrintn_4f16_z(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x half> %x) {
+; CHECK-LABEL: test_svrintn_4f16_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    frintn z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_4f16_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x half> @llvm.aarch64.sve.frintn.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x i1> %pg, <vscale x 4 x half> %x)
+  ret <vscale x 4 x half> %0
+}
+
+define <vscale x 2 x half> @test_svrintn_2f16_x_1(<vscale x 2 x i1> %pg, <vscale x 2 x half> %x) {
+; CHECK-LABEL: test_svrintn_2f16_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_2f16_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z0.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x half> @llvm.aarch64.sve.frintn.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x i1> %pg, <vscale x 2 x half> %x)
+  ret <vscale x 2 x half> %0
+}
+
+define <vscale x 2 x half> @test_svrintn_2f16_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x half> %x) {
+; CHECK-LABEL: test_svrintn_2f16_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintn z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_2f16_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x half> @llvm.aarch64.sve.frintn.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x i1> %pg, <vscale x 2 x half> %x)
+  ret <vscale x 2 x half> %0
+}
+
+define <vscale x 2 x half> @test_svrintn_2f16_z(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x half> %x) {
+; CHECK-LABEL: test_svrintn_2f16_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    frintn z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_2f16_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x half> @llvm.aarch64.sve.frintn.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x half> %x)
+  ret <vscale x 2 x half> %0
+}
+
+define <vscale x 2 x float> @test_svrintn_2f32_x_1(<vscale x 2 x i1> %pg, <vscale x 2 x float> %x) {
+; CHECK-LABEL: test_svrintn_2f32_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_2f32_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.s, p0/z, z0.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x float> @llvm.aarch64.sve.frintn.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x float> %x)
+  ret <vscale x 2 x float> %0
+}
+
+define <vscale x 2 x float> @test_svrintn_2f32_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x float> %x) {
+; CHECK-LABEL: test_svrintn_2f32_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintn z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_2f32_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x float> @llvm.aarch64.sve.frintn.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x float> %x)
+  ret <vscale x 2 x float> %0
+}
+
+define <vscale x 2 x float> @test_svrintn_2f32_z(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x float> %x) {
+; CHECK-LABEL: test_svrintn_2f32_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    frintn z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_2f32_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x float> @llvm.aarch64.sve.frintn.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x float> %x)
+  ret <vscale x 2 x float> %0
+}
+
+define <vscale x 4 x float> @test_svrintn_f32_x_1(<vscale x 4 x i1> %pg, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svrintn_f32_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f32_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.s, p0/z, z0.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.frintn.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svrintn_f32_x_2(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svrintn_f32_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintn z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f32_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.frintn.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svrintn_f32_z(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svrintn_f32_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    frintn z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f32_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.frintn.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 2 x double> @test_svrintn_f64_x_1(<vscale x 2 x i1> %pg, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svrintn_f64_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintn z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f64_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.d, p0/z, z0.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.frintn.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svrintn_f64_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svrintn_f64_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintn z0.d, p0/m, z1.d
+; 
CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintn_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintn z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrintn_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintn_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frintn z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintn_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintn z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintp_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintp_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintp z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintp_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintp_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintp z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintp_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintp_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintp z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_2f16_x_1: +; CHECK-2p2: // 
%bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintp_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintp_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintp z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frintp z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintp_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintp_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintp z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frintp z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp 
z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintp_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintp_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintp z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f64( poison, %pg, %x) + ret %0 +} + + +define @test_svrintp_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frintp z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintx_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintx_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintx_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintx_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call 
@llvm.aarch64.sve.frintx.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintx_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintx_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintx_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintx_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frintx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintx_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintx_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f32( poison, %pg, %x) + ret %0 +} + +define 
@test_svrintx_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frintx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintx_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintx_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frintx z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintz_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintz_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintz z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintz_4f16_x_1( %pg, %x) { +; CHECK-LABEL: 
test_svrintz_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintz z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintz_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintz_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintz z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintz_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintz_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintz z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: 
mov z0.s, #0 // =0x0 +; CHECK-NEXT: frintz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintz_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintz_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintz z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frintz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintz_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintz_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintz z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frintz z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrecpx_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrecpx_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frecpx z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; 
CHECK-2p2-LABEL: test_svrecpx_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrecpx_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrecpx_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frecpx z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrecpx_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrecpx_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frecpx z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrecpx_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrecpx_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frecpx z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry 
+; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frecpx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrecpx_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrecpx_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frecpx z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frecpx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrecpx_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svrecpx_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frecpx z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frecpx z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.d, p0/z, z1.d +; 
CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svsqrt_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svsqrt_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svsqrt_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svsqrt_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svsqrt_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svsqrt_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f16( poison, %pg, %x) + ret %0 +} + +define 
@test_svsqrt_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svsqrt_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svsqrt_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svsqrt_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svsqrt_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svsqrt_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svsqrt_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: 
movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: fsqrt z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinta_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinta_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinta z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinta_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinta_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinta_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinta z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinta_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinta_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call 
@llvm.aarch64.sve.frinta.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinta_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinta z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinta_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinta_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinta_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinta z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinta_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinta_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinta_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinta z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinta_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinta_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinta z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv2f64_ptrue(double %z0, %x, %y) 
{ +; CHECK-LABEL: test_svfrinta_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinta z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinta z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinti_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinti_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrinti_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinti_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinti z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinti_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinti_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrinti_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinti_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinti z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinti_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinti_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrinti_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinti_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: 
movprfx z0, z2 +; CHECK-NEXT: frinti z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinti_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinti_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrinti_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinti_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinti z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinti_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinti_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrinti_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinti_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinti z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinti_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinti_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinti z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrinti_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinti_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinti z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: 
test_svfrinti_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinti z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintm_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintm_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintm_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintm_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintm z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintm_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintm_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintm_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintm_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintm z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintm_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintm_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintm_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintm_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintm z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintm 
z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintm_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintm_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintm z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintm_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintm_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintm z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintm z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintm_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintm_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintm z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintm_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintm_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintm z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintm z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintm_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintm_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintm z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrintm_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintm_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintm z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintm z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call 
@llvm.aarch64.sve.frintm.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintn_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintn_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintn z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintn z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintn_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintn_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintn z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintn z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintn_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintn_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintn z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintn z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintn_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintn_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintn z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintn z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintn_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintn_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintn z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintn z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintn_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintn_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintn z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintn z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintn_nxv2f32_ptrue_u(double %z0, %x) { +; 
CHECK-LABEL: test_svfrintn_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintn z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintn z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintn_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintn_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintn z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintn z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintn_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintn_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintn z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintn z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintn_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintn_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintn z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintn z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintn_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintn_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintn z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintn z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrintn_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintn_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintn z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintn z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintp_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintp_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx 
z0, z1 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintp_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintp_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintp z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintp_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintp_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintp_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintp_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintp z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintp_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintp_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintp_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintp_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintp z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintp_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintp_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv2f32_ptrue_u: +; 
CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintp_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintp_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintp z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintp_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintp_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintp_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintp_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintp z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintp_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintp_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintp z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrintp_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintp_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintp z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintp z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintx_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintx_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: 
ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintx_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintx_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintx z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintx_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintx_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintx_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintx_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintx z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintx_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintx_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintx_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintx_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintx z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintx_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintx_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f32( poison, 
%pg, %x) + ret %0 +} + +define @test_svfrintx_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintx_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintx z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintx_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintx_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintx_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintx_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintx z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintx_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintx_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintx z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrintx_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintx_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintx z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintx z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintz_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintz_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintz_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: 
test_svfrintz_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintz z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintz_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintz_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintz_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintz_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintz z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintz_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintz_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintz_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintz_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintz z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintz_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintz_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintz_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintz_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; 
CHECK-NEXT: frintz z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintz_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintz_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintz_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintz_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintz z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintz_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintz_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintz z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrintz_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintz_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintz z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintz z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrecpx_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrecpx_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrecpx_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrecpx_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frecpx z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv8f16_ptrue: +; CHECK-2p2: 
// %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrecpx_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrecpx_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrecpx_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrecpx_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frecpx z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrecpx_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrecpx_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrecpx_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrecpx_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frecpx z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrecpx_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrecpx_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrecpx_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrecpx_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frecpx z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret 
+entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrecpx_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrecpx_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrecpx_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrecpx_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frecpx z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrecpx_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrecpx_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frecpx z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrecpx_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrecpx_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frecpx z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frecpx z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfsqrt_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfsqrt_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfsqrt_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfsqrt_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fsqrt z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv8f16( %x, %pg, %y) + ret %0 
+} + + +define @test_svfsqrt_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfsqrt_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfsqrt_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfsqrt_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fsqrt z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfsqrt_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfsqrt_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfsqrt_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfsqrt_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fsqrt z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfsqrt_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfsqrt_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfsqrt_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfsqrt_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fsqrt z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfsqrt_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfsqrt_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: 
ptrue p0.s
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svfsqrt_nxv4f32_ptrue_u:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: ptrue p0.s
+; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT: ret
+entry:
+  %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( poison, %pg, %x)
+  ret %0
+}
+
+define @test_svfsqrt_nxv4f32_ptrue(double %z0, %x, %y) {
+; CHECK-LABEL: test_svfsqrt_nxv4f32_ptrue:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: movprfx z0, z2
+; CHECK-NEXT: fsqrt z0.s, p0/m, z2.s
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svfsqrt_nxv4f32_ptrue:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: ptrue p0.s
+; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT: ret
+entry:
+  %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( %x, %pg, %y)
+  ret %0
+}
+
+
+define @test_svfsqrt_nxv2f64_ptrue_u(double %z0, %x) {
+; CHECK-LABEL: test_svfsqrt_nxv2f64_ptrue_u:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: fsqrt z0.d, p0/m, z1.d
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svfsqrt_nxv2f64_ptrue_u:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: ptrue p0.d
+; CHECK-2p2-NEXT: fsqrt z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT: ret
+entry:
+  %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f64( poison, %pg, %x)
+  ret %0
+}
+
+define @test_svfsqrt_nxv2f64_ptrue(double %z0, %x, %y) {
+; CHECK-LABEL: test_svfsqrt_nxv2f64_ptrue:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: movprfx z0, z2
+; CHECK-NEXT: fsqrt z0.d, p0/m, z2.d
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svfsqrt_nxv2f64_ptrue:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: ptrue p0.d
+; CHECK-2p2-NEXT: fsqrt z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT: ret
+entry:
+  %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f64( %x, %pg, %y)
+  ret %0
+}
+

From 351ee30529c054d39ea742c1b9c738c9e70c131b Mon Sep 17 00:00:00 2001
From: bernhardu
Date: Mon, 27 Jan 2025 10:49:07 +0100
Subject: [PATCH 167/432] [win/asan] GetInstructionSize: Support some more 6 byte instructions. (#124006)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds several instructions seen when trying to run an
executable built with ASan with llvm-mingw. (x86 and x86_64, using the
git tip in llvm-project). Also includes instructions collected by Roman
Pišl and Eric Pouech in the Wine bug reports below.
```
Related: https://github.com/llvm/llvm-project/issues/96270
Co-authored-by: Roman Pišl
https://bugs.winehq.org/show_bug.cgi?id=50993
https://bugs.winehq.org/attachment.cgi?id=70233
Co-authored-by: Eric Pouech
https://bugs.winehq.org/show_bug.cgi?id=52386
https://bugs.winehq.org/attachment.cgi?id=71626
```
---
 compiler-rt/lib/interception/interception_win.cpp            | 5 +++++
 compiler-rt/lib/interception/tests/interception_win_test.cpp | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index 64004c171d534..de6e74edce2d1 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -661,6 +661,10 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
     case 0xC1F6: // F6 C1 XX : test cl, XX
       return 3;
 
+    case 0x89FF: // FF 89 XX XX XX XX : dec dword ptr [ecx + XX XX XX XX]
+    case 0xEC81: // 81 EC XX XX XX XX : sub esp, XX XX XX XX
+      return 6;
+
     // Cannot overwrite control-instruction. Return 0 to indicate failure.
     case 0x25FF: // FF 25 XX YY ZZ WW : jmp dword ptr ds:[WWZZYYXX]
       return 0;
@@ -739,6 +743,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
     case 0x058B: // 8B 05 XX XX XX XX : mov eax, dword ptr [XX XX XX XX]
       if (rel_offset)
         *rel_offset = 2;
+    case 0xB841: // 41 B8 XX XX XX XX : mov r8d, XX XX XX XX
       return 6;
 
     case 0x7E81: // 81 7E YY XX XX XX XX  cmp DWORD PTR [rsi+YY], XX XX XX XX
diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp
index c5dcf26070f0d..c29968e974742 100644
--- a/compiler-rt/lib/interception/tests/interception_win_test.cpp
+++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp
@@ -870,6 +870,8 @@ const struct InstructionSizeData {
     { 5, {0xb8, 0x71, 0x72, 0x73, 0x74}, 0, "b8 XX XX XX XX : mov eax, XX XX XX XX"},
     { 5, {0xB9, 0x71, 0x72, 0x73, 0x74}, 0, "b9 XX XX XX XX : mov ecx, XX XX XX XX"},
     { 5, {0xBA, 0x71, 0x72, 0x73, 0x74}, 0, "ba XX XX XX XX : mov edx, XX XX XX XX"},
+    { 6, {0x81, 0xEC, 0x72, 0x73, 0x74, 0x75}, 0, "81 EC XX XX XX XX : sub esp, XX XX XX XX"},
+    { 6, {0xFF, 0x89, 0x72, 0x73, 0x74, 0x75}, 0, "FF 89 XX XX XX XX : dec dword ptr [ecx + XX XX XX XX]"},
     { 7, {0x8D, 0xA4, 0x24, 0x73, 0x74, 0x75, 0x76}, 0, "8D A4 24 XX XX XX XX : lea esp, [esp + XX XX XX XX]"},
 #if SANITIZER_WINDOWS_x64
     // sorted list
@@ -1000,6 +1002,7 @@ const struct InstructionSizeData {
     { 5, {0x66, 0x48, 0x0F, 0x7E, 0xC0}, 0, "66 48 0F 7E C0 : movq rax, xmm0"},
     { 5, {0x83, 0x44, 0x72, 0x73, 0x74}, 0, "83 44 72 XX YY : add DWORD PTR [rdx+rsi*2+XX],YY"},
     { 5, {0x83, 0x64, 0x24, 0x73, 0x74}, 0, "83 64 24 XX YY : and DWORD PTR [rsp+XX], YY"},
+    { 6, {0x41, 0xB8, 0x72, 0x73, 0x74, 0x75}, 0, "41 B8 XX XX XX XX : mov r8d, XX XX XX XX"},
     { 6, {0x48, 0x83, 0x64, 0x24, 0x74, 0x75}, 0, "48 83 64 24 XX YY : and QWORD PTR [rsp + XX], YY"},
     { 6, {0x66, 0x81, 0x78, 0x73, 0x74, 0x75}, 0, "66 81 78 XX YY YY : cmp WORD PTR [rax+XX], YY YY"},
     { 6, {0x66, 0x81, 0x79, 0x73, 0x74, 0x75}, 0, "66 81 79 XX YY YY : cmp WORD PTR [rcx+XX], YY YY"},

From bbf377060adc8607e1187952388c7eeea7cf4933 Mon Sep 17 00:00:00 2001
From: bernhardu
Date: Mon, 27 Jan 2025 10:50:54 +0100
Subject: [PATCH 168/432] [win/asan] GetInstructionSize: Support some more 7 or 8 byte instructions. (#124011)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds several instructions seen when trying to run an
executable built with ASan with llvm-mingw. (x86 and x86_64, using the
git tip in llvm-project). Also includes instructions collected by Roman
Pišl and Eric Pouech in the Wine bug reports below.
```
Related: https://github.com/llvm/llvm-project/issues/96270
Co-authored-by: Roman Pišl
https://bugs.winehq.org/show_bug.cgi?id=50993
https://bugs.winehq.org/attachment.cgi?id=70233
Co-authored-by: Eric Pouech
https://bugs.winehq.org/show_bug.cgi?id=52386
https://bugs.winehq.org/attachment.cgi?id=71626
```
---
 compiler-rt/lib/interception/interception_win.cpp            | 3 +++
 compiler-rt/lib/interception/tests/interception_win_test.cpp | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index de6e74edce2d1..002b37468a200 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -857,6 +857,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
       return 6;
 
     case 0xec8148: // 48 81 EC XX XX XX XX : sub rsp, XXXXXXXX
+    case 0xc0c748: // 48 C7 C0 XX XX XX XX : mov rax, XX XX XX XX
       return 7;
 
     // clang-format off
@@ -918,6 +919,8 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
       return 5;
     case 0x24648348: // 48 83 64 24 XX YY : and QWORD PTR [rsp + XX], YY
       return 6;
+    case 0x24A48D48: // 48 8D A4 24 XX XX XX XX : lea rsp, [rsp + XX XX XX XX]
+      return 8;
   }
 
   switch (0xFFFFFFFFFFULL & *(u64 *)(address)) {
diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp
index c29968e974742..2a7549d230ae2 100644
--- a/compiler-rt/lib/interception/tests/interception_win_test.cpp
+++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp
@@ -1022,6 +1022,7 @@ const struct InstructionSizeData {
     { 7, {0x48, 0x89, 0x15, 0x73, 0x74, 0x75, 0x76}, 3, "48 89 15 XX XX XX XX : mov QWORD PTR [rip + XXXXXXXX], rdx"},
     { 7, {0x48, 0x8b, 0x05, 0x73, 0x74, 0x75, 0x76}, 3, "48 8b 05 XX XX XX XX : mov rax, QWORD PTR [rip + XXXXXXXX]"},
     { 7, {0x48, 0x8d, 0x05, 0x73, 0x74, 0x75, 0x76}, 3, "48 8d 05 XX XX XX XX : lea rax, QWORD PTR [rip + XXXXXXXX]"},
+    { 7, {0x48, 0xc7, 0xc0, 0x73, 0x74, 0x75, 0x76}, 0, "48 C7 C0 XX XX XX XX : mov rax, XX XX XX XX"},
     { 7, {0x48, 0xff, 0x25, 0x73, 0x74, 0x75, 0x76}, 3, "48 ff 25 XX XX XX XX : rex.W jmp QWORD PTR [rip + XXXXXXXX]"},
     { 7, {0x4C, 0x8D, 0x15, 0x73, 0x74, 0x75, 0x76}, 3, "4c 8d 15 XX XX XX XX : lea r10, [rip + XX]"},
     { 7, {0x81, 0x78, 0x72, 0x73, 0x74, 0x75, 0x76}, 0, "81 78 YY XX XX XX XX : cmp DWORD PTR [rax+YY], XX XX XX XX"},
@@ -1037,6 +1038,7 @@ const struct InstructionSizeData {
     { 8, {0x41, 0x81, 0x7d, 0x73, 0x74, 0x75, 0x76, 0x77}, 0, "41 81 7d XX YY YY YY YY : cmp DWORD PTR [r13+YY], XX XX XX XX"},
     { 8, {0x41, 0x81, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77}, 0, "41 81 7e XX YY YY YY YY : cmp DWORD PTR [r14+YY], XX XX XX XX"},
     { 8, {0x41, 0x81, 0x7f, 0x73, 0x74, 0x75, 0x76, 0x77}, 0, "41 81 7f YY XX XX XX XX : cmp DWORD PTR [r15+YY], XX XX XX XX"},
+    { 8, {0x48, 0x8D, 0xA4, 0x24, 0x74, 0x75, 0x76, 0x77}, 0, "48 8D A4 24 XX XX XX XX : lea rsp, [rsp + XX XX XX XX]"},
     { 8, {0x81, 0x7c, 0x24, 0x73, 0x74, 0x75, 0x76, 0x77}, 0, "81 7c 24 YY XX XX XX XX : cmp DWORD PTR [rsp+YY], XX XX XX XX"},
     { 8, {0xc7, 0x44, 0x24, 0x73, 0x74, 0x75, 0x76, 0x77}, 0, "C7 44 24 XX YY YY YY YY : mov dword ptr [rsp + XX], YYYYYYYY"},
     { 9, {0x41, 0x81, 0x7c, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78}, 0, "41 81 7c ZZ YY XX XX XX XX : cmp DWORD PTR [reg+reg*n+YY], XX XX XX XX"},

From 7211bf48a62bfe3a181013f412f2fa6e112ae99f Mon Sep 17 00:00:00 2001
From: jeanPerier
Date: Mon, 27 Jan 2025 10:51:23 +0100
Subject: [PATCH 169/432] [flang][driver] add negative form of -fsave-main-program (#124110)

Add the `-fno` form for consistency and to make it easy to switch the
default for downstream users.
---
 clang/include/clang/Driver/Options.td     | 7 +++++--
 clang/lib/Driver/ToolChains/Flang.cpp     | 3 ++-
 flang/lib/Frontend/CompilerInvocation.cpp | 9 +++++----
 flang/test/Driver/fsave-main-program.f90  | 6 +++++-
 flang/test/Lower/fsave-main-program.f90   | 1 +
 5 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index c5b7fcb7c7f09..6c171a62bbeee 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6970,8 +6970,11 @@ defm unsigned : OptInFC1FFlag<"unsigned", "Enables UNSIGNED type">;
 def fno_automatic : Flag<["-"], "fno-automatic">, Group,
   HelpText<"Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE">;
-def fsave_main_program : Flag<["-"], "fsave-main-program">, Group,
-  HelpText<"Place all variables from the main program in static memory (otherwise scalars may be placed on the stack)">;
+defm save_main_program : BoolOptionWithoutMarshalling<"f", "save-main-program",
+  PosFlag,
+  NegFlag>;
 defm stack_arrays : BoolOptionWithoutMarshalling<"f", "stack-arrays",
   PosFlag,
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 45d05ed3e2485..1ae865f379110 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -58,7 +58,8 @@ void Flang::addFortranDialectOptions(const ArgList &Args,
                         options::OPT_fhermetic_module_files,
                         options::OPT_frealloc_lhs,
                         options::OPT_fno_realloc_lhs,
-                        options::OPT_fsave_main_program});
+                        options::OPT_fsave_main_program,
+                        options::OPT_fno_save_main_program});
 }
 
 void Flang::addPreprocessingOptions(const ArgList &Args,
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 3c6da4687f65d..68b5950d3a51b 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -770,10 +770,11 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
     opts.features.Enable(Fortran::common::LanguageFeature::DefaultSave);
   }
 
-  // -fsave-main-program
-  if (args.hasArg(clang::driver::options::OPT_fsave_main_program)) {
-    opts.features.Enable(Fortran::common::LanguageFeature::SaveMainProgram);
-  }
+  // -f{no}-save-main-program
+  opts.features.Enable(
+      Fortran::common::LanguageFeature::SaveMainProgram,
+      args.hasFlag(clang::driver::options::OPT_fsave_main_program,
+                   clang::driver::options::OPT_fno_save_main_program, false));
 
   if (args.hasArg(
           clang::driver::options::OPT_falternative_parameter_statement)) {
diff --git a/flang/test/Driver/fsave-main-program.f90 b/flang/test/Driver/fsave-main-program.f90
index bffdfd97911e8..e7a2f9d8b470e 100644
--- a/flang/test/Driver/fsave-main-program.f90
+++ b/flang/test/Driver/fsave-main-program.f90
@@ -1,5 +1,9 @@
 ! Check that the driver passes through -fsave-main-program:
 ! RUN: %flang -### -S -fsave-main-program %s -o - 2>&1 | FileCheck %s
+! CHECK: "-fc1"{{.*}}"-fsave-main-program"
+
+! RUN: %flang -### -S -fno-save-main-program %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK2
+! CHECK2: "-fc1"{{.*}}"-fno-save-main-program"
+
 ! Check that the compiler accepts -fsave-main-program:
 ! RUN: %flang_fc1 -emit-hlfir -fsave-main-program %s -o -
-! CHECK: "-fc1"{{.*}}"-fsave-main-program"
diff --git a/flang/test/Lower/fsave-main-program.f90 b/flang/test/Lower/fsave-main-program.f90
index 17fc1b02f5068..e89244c3c7c51 100644
--- a/flang/test/Lower/fsave-main-program.f90
+++ b/flang/test/Lower/fsave-main-program.f90
@@ -1,6 +1,7 @@
 ! Test -fsave-main-program switch.
 ! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s
 ! RUN: %flang_fc1 -fsave-main-program -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-SAVE %s
+! RUN: %flang_fc1 -fsave-main-program -fno-save-main-program -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s
 program test
 integer :: i
 call foo(i)

From 3b5e9eed2f67c1fb6dcf7033e92509ba2b0381e9 Mon Sep 17 00:00:00 2001
From: Durgadoss R
Date: Mon, 27 Jan 2025 15:52:43 +0530
Subject: [PATCH 170/432] [NVPTX] Add float to tf32 conversion intrinsics (#124316)

This patch adds the set of f32 -> tf32 cvt intrinsics introduced
in sm100 with ptx8.6. This builds on top of the recent PR #121507.
Tests are verified with a 12.8 ptxas executable.

PTX ISA link:
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt

Signed-off-by: Durgadoss R
---
 llvm/include/llvm/IR/IntrinsicsNVVM.td   |  8 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td  |  5 ++
 llvm/test/CodeGen/NVPTX/convert-sm100.ll | 68 ++++++++++++++++++++++++
 3 files changed, 81 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm100.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 68c2373a1a454..9a2f38d760e65 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1444,10 +1444,18 @@ let TargetPrefix = "nvvm" in {
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_f2tf32_rn_relu : ClangBuiltin<"__nvvm_f2tf32_rn_relu">,
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rn_satfinite : ClangBuiltin<"__nvvm_f2tf32_rn_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rn_relu_satfinite : ClangBuiltin<"__nvvm_f2tf32_rn_relu_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_f2tf32_rz : ClangBuiltin<"__nvvm_f2tf32_rz">,
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_f2tf32_rz_relu : ClangBuiltin<"__nvvm_f2tf32_rz_relu">,
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rz_satfinite : ClangBuiltin<"__nvvm_f2tf32_rz_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rz_relu_satfinite : ClangBuiltin<"__nvvm_f2tf32_rz_relu_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
 
   def int_nvvm_ff_to_e4m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e4m3x2_rn">,
       Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f17799c130015..633a99d0fc1be 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -764,6 +764,11 @@ let hasSideEffects = false in { defm CVT_to_tf32_rz_relu : CVT_TO_TF32<"rz.relu">; defm CVT_to_tf32_rna : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>; defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>; + + defm CVT_to_tf32_rn_satf : CVT_TO_TF32<"rn.satfinite", [hasPTX<86>, hasSM<100>]>; + defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>; + defm CVT_to_tf32_rn_relu_satf : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>; + defm CVT_to_tf32_rz_relu_satf : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>; } def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{ diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100.ll b/llvm/test/CodeGen/NVPTX/convert-sm100.ll new file mode 100644 index 0000000000000..f92822f7e0c16 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/convert-sm100.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} + +declare i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1) +declare i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1) +declare i32 @llvm.nvvm.f2tf32.rz.satfinite(float %f1) +declare i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float %f1) + +define i32 @cvt_rn_satf_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rn_satf_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_satf_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rn.satfinite.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1) + ret i32 %val +} + +define i32 @cvt_rn_relu_satf_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rn_relu_satf_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_satf_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rn.relu.satfinite.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1) + ret i32 %val +} + +define i32 @cvt_rz_satf_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rz_satf_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_satf_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rz.satfinite.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rz.satfinite(float %f1) + ret i32 %val +} + +define i32 @cvt_rz_relu_satf_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rz_relu_satf_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_satf_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rz.relu.satfinite.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float %f1) + ret i32 %val +} From 87103a016fbfd480e1d3bb8eba23c27a9c74e70d Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 27 Jan 2025 10:41:53 +0000 Subject: 
[PATCH 171/432] [AArch64] Implement NEON FP8 vectors as VectorType
 (#123603)

Reimplement Neon FP8 vector types using the attribute `neon_vector_type`
instead of having them as builtin types. This makes it possible to implement
FP8 Neon intrinsics without adding special cases for these types when using
`__builtin_shufflevector` or bitcasts (via the C-style cast operator) between
vectors, both of which are used extensively in the generated code in
`arm_neon.h`.
---
 clang/include/clang/AST/Type.h                |   5 +
 .../clang/Basic/AArch64SVEACLETypes.def       |   2 -
 clang/include/clang/Basic/TargetBuiltins.h    |   4 +-
 clang/lib/AST/ItaniumMangle.cpp               |   5 +
 clang/lib/CodeGen/CGBuiltin.cpp               |   1 +
 clang/lib/CodeGen/CGExpr.cpp                  |  11 +-
 clang/lib/CodeGen/CodeGenTypes.cpp            |   4 +-
 clang/lib/CodeGen/Targets/AArch64.cpp         |   7 +-
 clang/lib/Sema/SemaARM.cpp                    |   2 +
 clang/lib/Sema/SemaExpr.cpp                   |   7 +-
 clang/lib/Sema/SemaType.cpp                   |   3 +-
 .../AArch64/builtin-shufflevector-fp8.c       | 123 +++++++++++
 clang/test/CodeGen/AArch64/fp8-cast.c         | 193 ++++++++++++++++++
 clang/test/CodeGen/arm-mfp8.c                 |  88 ++++----
 .../aarch64-mangle-neon-vectors.cpp           |   7 +
 clang/test/CodeGenCXX/mangle-neon-vectors.cpp |  11 +
 clang/test/Sema/aarch64-fp8-cast.c            | 104 ++++++++++
 clang/test/Sema/arm-mfp8.cpp                  |  34 +--
 clang/utils/TableGen/NeonEmitter.cpp          |  11 +-
 19 files changed, 553 insertions(+), 69 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
 create mode 100644 clang/test/CodeGen/AArch64/fp8-cast.c
 create mode 100644 clang/test/Sema/aarch64-fp8-cast.c
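To see what the new representation enables, here is a minimal sketch distilled
from the tests added in this patch: the FP8 vector types now work with
`__builtin_shufflevector` and with C-style bitcasts like any other NEON vector
type, while arithmetic on them is still deliberately rejected in Sema (see the
arm-mfp8.cpp diagnostics below).

```c
#include <arm_neon.h>

// Element shuffles now work directly on FP8 NEON vectors.
mfloat8x8_t reverse_halves(mfloat8x8_t x) {
  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
}

// So do same-size bitcasts via the C-style cast operator.
int8x8_t as_int8(mfloat8x8_t x) {
  return (int8x8_t)x;
}
```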
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 3457d524c63aa..1d9743520654e 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2518,6 +2518,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   bool isFloat32Type() const;
   bool isDoubleType() const;
   bool isBFloat16Type() const;
+  bool isMFloat8Type() const;
   bool isFloat128Type() const;
   bool isIbm128Type() const;
   bool isRealType() const; // C99 6.2.5p17 (real floating + integer)
@@ -8537,6 +8538,10 @@ inline bool Type::isBFloat16Type() const {
   return isSpecificBuiltinType(BuiltinType::BFloat16);
 }

+inline bool Type::isMFloat8Type() const {
+  return isSpecificBuiltinType(BuiltinType::MFloat8);
+}
+
 inline bool Type::isFloat128Type() const {
   return isSpecificBuiltinType(BuiltinType::Float128);
 }
diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def
index 063cac1f4a58e..2dd2754e778d6 100644
--- a/clang/include/clang/Basic/AArch64SVEACLETypes.def
+++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def
@@ -201,8 +201,6 @@ SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4T
 SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy)

 AARCH64_VECTOR_TYPE_MFLOAT("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 1, 8, 1)
-AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x8_t", "__MFloat8x8_t", MFloat8x8, MFloat8x8Ty, 8, 8, 1)
-AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x16_t", "__MFloat8x16_t", MFloat8x16, MFloat8x16Ty, 16, 8, 1)

 #undef SVE_VECTOR_TYPE
 #undef SVE_VECTOR_TYPE_BFLOAT
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index 4dc8b24ed8ae6..83ef015018f1a 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -208,7 +208,8 @@ namespace clang {
       Float16,
       Float32,
       Float64,
-      BFloat16
+      BFloat16,
+      MFloat8
     };

     NeonTypeFlags(unsigned F) : Flags(F) {}
@@ -230,6 +231,7 @@ namespace clang {
       switch (getEltType()) {
       case Int8:
       case Poly8:
+      case MFloat8:
         return 8;
       case Int16:
       case Float16:
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 1dd936cf4fb51..9948963d7f44b 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3919,6 +3919,9 @@ void CXXNameMangler::mangleNeonVectorType(const VectorType *T) {
   case BuiltinType::Float:     EltName = "float32_t"; break;
   case BuiltinType::Half:      EltName = "float16_t"; break;
   case BuiltinType::BFloat16:  EltName = "bfloat16_t"; break;
+  case BuiltinType::MFloat8:
+    EltName = "mfloat8_t";
+    break;
   default:
     llvm_unreachable("unexpected Neon vector element type");
   }
@@ -3972,6 +3975,8 @@ static StringRef mangleAArch64VectorBase(const BuiltinType *EltType) {
     return "Float64";
   case BuiltinType::BFloat16:
     return "Bfloat16";
+  case BuiltinType::MFloat8:
+    return "Mfloat8";
   default:
     llvm_unreachable("Unexpected vector element base type");
   }
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 26bccccdc5e36..5162ac503b8eb 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6679,6 +6679,7 @@ static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF,
   switch (TypeFlags.getEltType()) {
   case NeonTypeFlags::Int8:
   case NeonTypeFlags::Poly8:
+  case NeonTypeFlags::MFloat8:
     return llvm::FixedVectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
   case NeonTypeFlags::Int16:
   case NeonTypeFlags::Poly16:
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 054f8d1eadb8c..e9f5497aaae24 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -2414,8 +2414,15 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, LValue Dst,
         Vec = Builder.CreateBitCast(Vec, IRVecTy);
         // iN --> <N x i1>.
       }
-      Vec = Builder.CreateInsertElement(Vec, Src.getScalarVal(),
-                                        Dst.getVectorIdx(), "vecins");
+      llvm::Value *SrcVal = Src.getScalarVal();
+      // Allow inserting `<1 x T>` into an `<N x T>`. It can happen with scalar
+      // types which are mapped to vector LLVM IR types (e.g. for implementing
+      // an ABI).
+      if (auto *EltTy = dyn_cast<llvm::FixedVectorType>(SrcVal->getType());
+          EltTy && EltTy->getNumElements() == 1)
+        SrcVal = Builder.CreateBitCast(SrcVal, EltTy->getElementType());
+      Vec = Builder.CreateInsertElement(Vec, SrcVal, Dst.getVectorIdx(),
+                                        "vecins");
       if (IRStoreTy) {
         // <N x i1> --> iN.
         Vec = Builder.CreateBitCast(Vec, IRStoreTy);
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 09191a4901f49..950b23f4e13b9 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -650,7 +650,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
     // An ext_vector_type of Bool is really a vector of bits.
     llvm::Type *IRElemTy = VT->isExtVectorBoolType()
                                ? llvm::Type::getInt1Ty(getLLVMContext())
-                               : ConvertType(VT->getElementType());
+                               : (VT->getElementType()->isMFloat8Type()
+                                      ? llvm::Type::getInt8Ty(getLLVMContext())
+                                      : ConvertType(VT->getElementType()));
     ResultType = llvm::FixedVectorType::get(IRElemTy, VT->getNumElements());
     break;
   }
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 7db67ecba07c8..c702e79ff8eb9 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -383,10 +383,6 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadicFn,
       NSRN = std::min(NSRN + 1, 8u);
     else {
       switch (BT->getKind()) {
-      case BuiltinType::MFloat8x8:
-      case BuiltinType::MFloat8x16:
-        NSRN = std::min(NSRN + 1, 8u);
-        break;
       case BuiltinType::SveBool:
       case BuiltinType::SveCount:
         NPRN = std::min(NPRN + 1, 4u);
@@ -629,8 +625,7 @@ bool AArch64ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
   // but with the difference that any floating-point type is allowed,
   // including __fp16.
   if (const BuiltinType *BT = Ty->getAs<BuiltinType>()) {
-    if (BT->isFloatingPoint() || BT->getKind() == BuiltinType::MFloat8x16 ||
-        BT->getKind() == BuiltinType::MFloat8x8)
+    if (BT->isFloatingPoint())
       return true;
   } else if (const VectorType *VT = Ty->getAs<VectorType>()) {
     if (auto Kind = VT->getVectorKind();
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index db418d80e0e09..2620bbc97ba02 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -352,6 +352,8 @@ static QualType getNeonEltType(NeonTypeFlags Flags, ASTContext &Context,
     return Context.DoubleTy;
   case NeonTypeFlags::BFloat16:
     return Context.BFloat16Ty;
+  case NeonTypeFlags::MFloat8:
+    return Context.MFloat8Ty;
   }
   llvm_unreachable("Invalid NeonTypeFlag!");
 }
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index d5273d463d7c0..176627c3df37c 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -7503,7 +7503,7 @@ static bool breakDownVectorType(QualType type, uint64_t &len,
   if (const VectorType *vecType = type->getAs<VectorType>()) {
     len = vecType->getNumElements();
     eltType = vecType->getElementType();
-    assert(eltType->isScalarType());
+    assert(eltType->isScalarType() || eltType->isMFloat8Type());
     return true;
   }

@@ -10174,6 +10174,11 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS,
     return HLSL().handleVectorBinOpConversion(LHS, RHS, LHSType, RHSType,
                                               IsCompAssign);

+  // Any operation with MFloat8 type is only possible with C intrinsics
+  if ((LHSVecType && LHSVecType->getElementType()->isMFloat8Type()) ||
+      (RHSVecType && RHSVecType->getElementType()->isMFloat8Type()))
+    return InvalidOperands(Loc, LHS, RHS);
+
   // AltiVec-style "vector bool op vector bool" combinations are allowed
   // for some operators but not others.
   if (!AllowBothBool && LHSVecType &&
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 2ccf5a8e1d6f3..33d5378944ddb 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -8306,7 +8306,8 @@ static bool isPermittedNeonBaseType(QualType &Ty, VectorKind VecKind, Sema &S) {
          BTy->getKind() == BuiltinType::ULongLong ||
          BTy->getKind() == BuiltinType::Float ||
          BTy->getKind() == BuiltinType::Half ||
-         BTy->getKind() == BuiltinType::BFloat16;
+         BTy->getKind() == BuiltinType::BFloat16 ||
+         BTy->getKind() == BuiltinType::MFloat8;
 }

 static bool verifyValidIntegerConstantExpr(Sema &S, const ParsedAttr &Attr,
diff --git a/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
new file mode 100644
index 0000000000000..147ca1d1becc1
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
@@ -0,0 +1,123 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-linux -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
+typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
+
+typedef __attribute__((neon_vector_type(8))) __mfp8 mfloat8x8_t;
+typedef __attribute__((neon_vector_type(16))) __mfp8 mfloat8x16_t;
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_8x8(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> [[X]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <8 x i8> [[SHUFFLE]]
+//
+mfloat8x8_t test_8x8(mfloat8x8_t x) {
+  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_8x8_v(
+// CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MASK:%.*]] = and <8 x i8> [[P]], splat (i8 7)
+// CHECK-NEXT:    [[SHUF_IDX:%.*]] = extractelement <8 x i8> [[MASK]], i64 0
+// CHECK-NEXT:    [[SHUF_ELT:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX]]
+// CHECK-NEXT:    [[SHUF_INS:%.*]] = insertelement <8 x i8> poison, i8 [[SHUF_ELT]], i64 0
+// CHECK-NEXT:    [[SHUF_IDX1:%.*]] = extractelement <8 x i8> [[MASK]], i64 1
+// CHECK-NEXT:    [[SHUF_ELT2:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX1]]
+// CHECK-NEXT:    [[SHUF_INS3:%.*]] = insertelement <8 x i8> [[SHUF_INS]], i8 [[SHUF_ELT2]], i64 1
+// CHECK-NEXT:    [[SHUF_IDX4:%.*]] = extractelement <8 x i8> [[MASK]], i64 2
+// CHECK-NEXT:    [[SHUF_ELT5:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX4]]
+// CHECK-NEXT:    [[SHUF_INS6:%.*]] = insertelement <8 x i8> [[SHUF_INS3]], i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT:    [[SHUF_IDX7:%.*]] = extractelement <8 x i8> [[MASK]], i64 3
+// CHECK-NEXT:    [[SHUF_ELT8:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX7]]
+// CHECK-NEXT:    [[SHUF_INS9:%.*]] = insertelement <8 x i8> [[SHUF_INS6]], i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT:    [[SHUF_IDX10:%.*]] = extractelement <8 x i8> [[MASK]], i64 4
+// CHECK-NEXT:    [[SHUF_ELT11:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX10]]
+// CHECK-NEXT:    [[SHUF_INS12:%.*]] = insertelement <8 x i8> [[SHUF_INS9]], i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT:    [[SHUF_IDX13:%.*]] = extractelement <8 x i8> [[MASK]], i64 5
+// CHECK-NEXT:    [[SHUF_ELT14:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX13]]
+// CHECK-NEXT:    [[SHUF_INS15:%.*]] = insertelement <8 x i8> [[SHUF_INS12]], i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT:    [[SHUF_IDX16:%.*]] = extractelement <8 x i8> [[MASK]], i64 6
+// CHECK-NEXT:    [[SHUF_ELT17:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX16]]
+// CHECK-NEXT:    [[SHUF_INS18:%.*]] = insertelement <8 x i8> [[SHUF_INS15]], i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT:    [[SHUF_IDX19:%.*]] = extractelement <8 x i8> [[MASK]], i64 7
+// CHECK-NEXT:    [[SHUF_ELT20:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX19]]
+// CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <8 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT:    ret <8 x i8> [[SHUF_INS21]]
+//
+mfloat8x8_t test_8x8_v(mfloat8x8_t x, int8x8_t p) {
+  return __builtin_shufflevector(x, p);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_8x16(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[X]], <16 x i8> [[X]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE]]
+//
+mfloat8x16_t test_8x16(mfloat8x16_t x) {
+  return __builtin_shufflevector(x, x, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2,
+                                 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_8x16_v(
+// CHECK-SAME: <16 x i8> [[X:%.*]], <16 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MASK:%.*]] = and <16 x i8> [[P]], splat (i8 15)
+// CHECK-NEXT:    [[SHUF_IDX:%.*]] = extractelement <16 x i8> [[MASK]], i64 0
+// CHECK-NEXT:    [[SHUF_ELT:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX]]
+// CHECK-NEXT:    [[SHUF_INS:%.*]] = insertelement <16 x i8> poison, i8 [[SHUF_ELT]], i64 0
+// CHECK-NEXT:    [[SHUF_IDX1:%.*]] = extractelement <16 x i8> [[MASK]], i64 1
+// CHECK-NEXT:    [[SHUF_ELT2:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX1]]
+// CHECK-NEXT:    [[SHUF_INS3:%.*]] = insertelement <16 x i8> [[SHUF_INS]], i8 [[SHUF_ELT2]], i64 1
+// CHECK-NEXT:    [[SHUF_IDX4:%.*]] = extractelement <16 x i8> [[MASK]], i64 2
+// CHECK-NEXT:    [[SHUF_ELT5:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX4]]
+// CHECK-NEXT:    [[SHUF_INS6:%.*]] = insertelement <16 x i8> [[SHUF_INS3]], i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT:    [[SHUF_IDX7:%.*]] = extractelement <16 x i8> [[MASK]], i64 3
+// CHECK-NEXT:    [[SHUF_ELT8:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX7]]
+// CHECK-NEXT:    [[SHUF_INS9:%.*]] = insertelement <16 x i8> [[SHUF_INS6]], i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT:    [[SHUF_IDX10:%.*]] = extractelement <16 x i8> [[MASK]], i64 4
+// CHECK-NEXT:    [[SHUF_ELT11:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX10]]
+// CHECK-NEXT:    [[SHUF_INS12:%.*]] = insertelement <16 x i8> [[SHUF_INS9]], i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT:    [[SHUF_IDX13:%.*]] = extractelement <16 x i8> [[MASK]], i64 5
+// CHECK-NEXT:    [[SHUF_ELT14:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX13]]
+// CHECK-NEXT:    [[SHUF_INS15:%.*]] = insertelement <16 x i8> [[SHUF_INS12]], i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT:    [[SHUF_IDX16:%.*]] = extractelement <16 x i8> [[MASK]], i64 6
+// CHECK-NEXT:    [[SHUF_ELT17:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX16]]
+// CHECK-NEXT:    [[SHUF_INS18:%.*]] = insertelement <16 x i8> [[SHUF_INS15]], i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT:    [[SHUF_IDX19:%.*]] = extractelement <16 x i8> [[MASK]], i64 7
+// CHECK-NEXT:    [[SHUF_ELT20:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX19]]
+// CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <16 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT:    [[SHUF_IDX22:%.*]] = extractelement <16 x i8> [[MASK]], i64 8
+// CHECK-NEXT:    [[SHUF_ELT23:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX22]]
+// CHECK-NEXT:    [[SHUF_INS24:%.*]] = insertelement <16 x i8> [[SHUF_INS21]], i8 [[SHUF_ELT23]], i64 8
+// CHECK-NEXT:    [[SHUF_IDX25:%.*]] = extractelement <16 x i8> [[MASK]], i64 9
+// CHECK-NEXT:    [[SHUF_ELT26:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX25]]
+// CHECK-NEXT:    [[SHUF_INS27:%.*]] = insertelement <16 x i8> [[SHUF_INS24]], i8 [[SHUF_ELT26]], i64 9
+// CHECK-NEXT:    [[SHUF_IDX28:%.*]] = extractelement <16 x i8> [[MASK]], i64 10
+// CHECK-NEXT:    [[SHUF_ELT29:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX28]]
+// CHECK-NEXT:    [[SHUF_INS30:%.*]] = insertelement <16 x i8> [[SHUF_INS27]], i8 [[SHUF_ELT29]], i64 10
+// CHECK-NEXT:    [[SHUF_IDX31:%.*]] = extractelement <16 x i8> [[MASK]], i64 11
+// CHECK-NEXT:    [[SHUF_ELT32:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX31]]
+// CHECK-NEXT:    [[SHUF_INS33:%.*]] = insertelement <16 x i8> [[SHUF_INS30]], i8 [[SHUF_ELT32]], i64 11
+// CHECK-NEXT:    [[SHUF_IDX34:%.*]] = extractelement <16 x i8> [[MASK]], i64 12
+// CHECK-NEXT:    [[SHUF_ELT35:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX34]]
+// CHECK-NEXT:    [[SHUF_INS36:%.*]] = insertelement <16 x i8> [[SHUF_INS33]], i8 [[SHUF_ELT35]], i64 12
+// CHECK-NEXT:    [[SHUF_IDX37:%.*]] = extractelement <16 x i8> [[MASK]], i64 13
+// CHECK-NEXT:    [[SHUF_ELT38:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX37]]
+// CHECK-NEXT:    [[SHUF_INS39:%.*]] = insertelement <16 x i8> [[SHUF_INS36]], i8 [[SHUF_ELT38]], i64 13
+// CHECK-NEXT:    [[SHUF_IDX40:%.*]] = extractelement <16 x i8> [[MASK]], i64 14
+// CHECK-NEXT:    [[SHUF_ELT41:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX40]]
+// CHECK-NEXT:    [[SHUF_INS42:%.*]] = insertelement <16 x i8> [[SHUF_INS39]], i8 [[SHUF_ELT41]], i64 14
+// CHECK-NEXT:    [[SHUF_IDX43:%.*]] = extractelement <16 x i8> [[MASK]], i64 15
+// CHECK-NEXT:    [[SHUF_ELT44:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX43]]
+// CHECK-NEXT:    [[SHUF_INS45:%.*]] = insertelement <16 x i8> [[SHUF_INS42]], i8 [[SHUF_ELT44]], i64 15
+// CHECK-NEXT:    ret <16 x i8> [[SHUF_INS45]]
+//
+mfloat8x16_t test_8x16_v(mfloat8x16_t x, int8x16_t p) {
+  return __builtin_shufflevector(x, p);
+}
diff --git a/clang/test/CodeGen/AArch64/fp8-cast.c b/clang/test/CodeGen/AArch64/fp8-cast.c
new file mode 100644
index 0000000000000..a9ce31b9e6bea
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-cast.c
@@ -0,0 +1,193 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -S -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// Bitcast between FP8 Neon vectors
+// CHECK-LABEL: define dso_local <8 x i8> @test_f8_f8(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <8 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z10test_f8_f813__Mfloat8x8_t(
+// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <8 x i8> [[X]]
+//
+mfloat8x8_t test_f8_f8(mfloat8x8_t x) { + return (mfloat8x8_t) x; +} + +// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_f8( +// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z11testq_f8_f814__Mfloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <16 x i8> [[X]] +// +mfloat8x16_t testq_f8_f8(mfloat8x16_t x) { + return (mfloat8x16_t) x; +} + +// Bitcast between FP8 and int8 Neon vectors +// CHECK-LABEL: define dso_local <8 x i8> @test_f8_s8( +// CHECK-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z10test_f8_s810__Int8x8_t( +// CHECK-CXX-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <8 x i8> [[X]] +// +mfloat8x8_t test_f8_s8(int8x8_t x) { + return (mfloat8x8_t) x; +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_s8_f8( +// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x i8> @_Z10test_s8_f813__Mfloat8x8_t( +// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <8 x i8> [[X]] +// +int8x8_t test_s8_f8(mfloat8x8_t x) { + return (int8x8_t) x; +} + +// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_s8( +// CHECK-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z11testq_f8_s811__Int8x16_t( +// CHECK-CXX-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <16 x i8> [[X]] +// +mfloat8x16_t testq_f8_s8(int8x16_t x) { + return (mfloat8x16_t) x; +} + +// CHECK-LABEL: define dso_local <16 x i8> @testq_s8_f8( +// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local noundef <16 x i8> @_Z11testq_s8_f814__Mfloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <16 x i8> [[X]] +// +int8x16_t testq_s8_f8(mfloat8x16_t x) { + return (int8x16_t) x; +} + +// Bitcast between FP8 and float32 Neon vectors +// CHECK-LABEL: define dso_local <8 x i8> @test_f8_f32( +// CHECK-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z11test_f8_f3213__Float32x2_t( +// CHECK-CXX-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <8 x i8> +// CHECK-CXX-NEXT: ret <8 x i8> [[TMP0]] +// +mfloat8x8_t test_f8_f32(float32x2_t x) { + return (mfloat8x8_t) x; +} + +// CHECK-LABEL: define dso_local <2 x float> @test_f32_f8( +// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[X]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z11test_f32_f813__Mfloat8x8_t( +// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: 
[[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[X]] to <2 x float> +// CHECK-CXX-NEXT: ret <2 x float> [[TMP0]] +// +float32x2_t test_f32_f8(mfloat8x8_t x) { + return (float32x2_t) x; +} + +// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_f32( +// CHECK-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[X]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z12testq_f8_f3213__Float32x4_t( +// CHECK-CXX-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[X]] to <16 x i8> +// CHECK-CXX-NEXT: ret <16 x i8> [[TMP0]] +// +mfloat8x16_t testq_f8_f32(float32x4_t x) { + return (mfloat8x16_t) x; +} + +// CHECK-LABEL: define dso_local <4 x float> @testq_f32_f8( +// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z12testq_f32_f814__Mfloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to <4 x float> +// CHECK-CXX-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t testq_f32_f8(mfloat8x16_t x) { + return (float32x4_t) x; +} + +// Bitcast between FP8 and poly128_t (which is integral) +// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_p128( +// CHECK-SAME: i128 noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[X]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z13testq_f8_p128o( +// CHECK-CXX-SAME: i128 noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast i128 [[X]] to <16 x i8> +// CHECK-CXX-NEXT: ret <16 x i8> [[TMP0]] +// +mfloat8x16_t testq_f8_p128(poly128_t x) { + return (mfloat8x16_t) x; +} + +// CHECK-LABEL: define dso_local i128 @testq_p128_f8( +// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to i128 +// CHECK-NEXT: ret i128 [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local noundef i128 @_Z13testq_p128_f814__Mfloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to i128 +// CHECK-CXX-NEXT: ret i128 [[TMP0]] +// +poly128_t testq_p128_f8(mfloat8x16_t x) { + return (poly128_t) x; +} diff --git a/clang/test/CodeGen/arm-mfp8.c b/clang/test/CodeGen/arm-mfp8.c index bf91066335a25..9385b537f18b3 100644 --- a/clang/test/CodeGen/arm-mfp8.c +++ b/clang/test/CodeGen/arm-mfp8.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -emit-llvm -triple aarch64-arm-none-eabi -target-feature -fp8 -target-feature +neon -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -emit-llvm -triple aarch64-arm-none-eabi -target-feature -fp8 -target-feature +neon -o - -x c++ %s | FileCheck %s --check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -emit-llvm -triple aarch64-arm-none-eabi -target-feature -fp8 -target-feature +neon -disable-O0-optnone -o - %s | opt -S --passes=mem2reg | FileCheck %s 
--check-prefixes=CHECK-C +// RUN: %clang_cc1 -emit-llvm -triple aarch64-arm-none-eabi -target-feature -fp8 -target-feature +neon -disable-O0-optnone -o - -x c++ %s | opt -S --passes=mem2reg | FileCheck %s --check-prefixes=CHECK-CXX // REQUIRES: aarch64-registered-target @@ -10,18 +10,12 @@ // CHECK-C-LABEL: define dso_local <16 x i8> @test_ret_mfloat8x16_t( // CHECK-C-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-C-NEXT: [[ENTRY:.*:]] -// CHECK-C-NEXT: [[V_ADDR:%.*]] = alloca <16 x i8>, align 16 -// CHECK-C-NEXT: store <16 x i8> [[V]], ptr [[V_ADDR]], align 16 -// CHECK-C-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V_ADDR]], align 16 -// CHECK-C-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-C-NEXT: ret <16 x i8> [[V]] // -// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z21test_ret_mfloat8x16_tu14__MFloat8x16_t( +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z21test_ret_mfloat8x16_t14__Mfloat8x16_t( // CHECK-CXX-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[V_ADDR:%.*]] = alloca <16 x i8>, align 16 -// CHECK-CXX-NEXT: store <16 x i8> [[V]], ptr [[V_ADDR]], align 16 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V_ADDR]], align 16 -// CHECK-CXX-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-CXX-NEXT: ret <16 x i8> [[V]] // mfloat8x16_t test_ret_mfloat8x16_t(mfloat8x16_t v) { return v; @@ -30,18 +24,12 @@ mfloat8x16_t test_ret_mfloat8x16_t(mfloat8x16_t v) { // CHECK-C-LABEL: define dso_local <8 x i8> @test_ret_mfloat8x8_t( // CHECK-C-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-C-NEXT: [[ENTRY:.*:]] -// CHECK-C-NEXT: [[V_ADDR:%.*]] = alloca <8 x i8>, align 8 -// CHECK-C-NEXT: store <8 x i8> [[V]], ptr [[V_ADDR]], align 8 -// CHECK-C-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[V_ADDR]], align 8 -// CHECK-C-NEXT: ret <8 x i8> [[TMP0]] +// CHECK-C-NEXT: ret <8 x i8> [[V]] // -// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z20test_ret_mfloat8x8_tu13__MFloat8x8_t( +// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z20test_ret_mfloat8x8_t13__Mfloat8x8_t( // CHECK-CXX-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[V_ADDR:%.*]] = alloca <8 x i8>, align 8 -// CHECK-CXX-NEXT: store <8 x i8> [[V]], ptr [[V_ADDR]], align 8 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[V_ADDR]], align 8 -// CHECK-CXX-NEXT: ret <8 x i8> [[TMP0]] +// CHECK-CXX-NEXT: ret <8 x i8> [[V]] // mfloat8x8_t test_ret_mfloat8x8_t(mfloat8x8_t v) { return v; @@ -50,28 +38,22 @@ mfloat8x8_t test_ret_mfloat8x8_t(mfloat8x8_t v) { // CHECK-C-LABEL: define dso_local <1 x i8> @func1n( // CHECK-C-SAME: <1 x i8> [[MFP8:%.*]]) #[[ATTR0]] { // CHECK-C-NEXT: [[ENTRY:.*:]] -// CHECK-C-NEXT: [[MFP8_ADDR:%.*]] = alloca <1 x i8>, align 1 // CHECK-C-NEXT: [[F1N:%.*]] = alloca [10 x <1 x i8>], align 1 -// CHECK-C-NEXT: store <1 x i8> [[MFP8]], ptr [[MFP8_ADDR]], align 1 -// CHECK-C-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[MFP8_ADDR]], align 1 // CHECK-C-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 -// CHECK-C-NEXT: store <1 x i8> [[TMP0]], ptr [[ARRAYIDX]], align 1 +// CHECK-C-NEXT: store <1 x i8> [[MFP8]], ptr [[ARRAYIDX]], align 1 // CHECK-C-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 -// CHECK-C-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1 -// CHECK-C-NEXT: ret <1 x i8> [[TMP1]] +// CHECK-C-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1 +// CHECK-C-NEXT: ret <1 x i8> [[TMP0]] // // CHECK-CXX-LABEL: 
define dso_local <1 x i8> @_Z6func1nu6__mfp8( // CHECK-CXX-SAME: <1 x i8> [[MFP8:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[MFP8_ADDR:%.*]] = alloca <1 x i8>, align 1 // CHECK-CXX-NEXT: [[F1N:%.*]] = alloca [10 x <1 x i8>], align 1 -// CHECK-CXX-NEXT: store <1 x i8> [[MFP8]], ptr [[MFP8_ADDR]], align 1 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[MFP8_ADDR]], align 1 // CHECK-CXX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 -// CHECK-CXX-NEXT: store <1 x i8> [[TMP0]], ptr [[ARRAYIDX]], align 1 +// CHECK-CXX-NEXT: store <1 x i8> [[MFP8]], ptr [[ARRAYIDX]], align 1 // CHECK-CXX-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 -// CHECK-CXX-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1 -// CHECK-CXX-NEXT: ret <1 x i8> [[TMP1]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1 +// CHECK-CXX-NEXT: ret <1 x i8> [[TMP0]] // __mfp8 func1n(__mfp8 mfp8) { __mfp8 f1n[10]; @@ -79,7 +61,43 @@ __mfp8 func1n(__mfp8 mfp8) { return f1n[2]; } +// CHECK-C-LABEL: define dso_local <1 x i8> @test_extract_element( +// CHECK-C-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 +// CHECK-C-NEXT: [[VECEXT:%.*]] = extractelement <16 x i8> [[X]], i32 [[I]] +// CHECK-C-NEXT: store i8 [[VECEXT]], ptr [[RETVAL]], align 1 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1 +// CHECK-C-NEXT: ret <1 x i8> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <1 x i8> @_Z20test_extract_element14__Mfloat8x16_ti( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 +// CHECK-CXX-NEXT: [[VECEXT:%.*]] = extractelement <16 x i8> [[X]], i32 [[I]] +// CHECK-CXX-NEXT: store i8 [[VECEXT]], ptr [[RETVAL]], align 1 +// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1 +// CHECK-CXX-NEXT: ret <1 x i8> [[TMP0]] +// +mfloat8_t test_extract_element(mfloat8x16_t x, int i) { + return x[i]; +} - -//// NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -// CHECK: {{.*}} +// CHECK-C-LABEL: define dso_local <16 x i8> @test_insert_element( +// CHECK-C-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]], <1 x i8> [[V:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[V]] to i8 +// CHECK-C-NEXT: [[VECINS:%.*]] = insertelement <16 x i8> [[X]], i8 [[TMP0]], i32 [[I]] +// CHECK-C-NEXT: ret <16 x i8> [[VECINS]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z19test_insert_element14__Mfloat8x16_tiu6__mfp8( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]], <1 x i8> [[V:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[V]] to i8 +// CHECK-CXX-NEXT: [[VECINS:%.*]] = insertelement <16 x i8> [[X]], i8 [[TMP0]], i32 [[I]] +// CHECK-CXX-NEXT: ret <16 x i8> [[VECINS]] +// +mfloat8x16_t test_insert_element(mfloat8x16_t x, int i, mfloat8_t v) { + x[i] = v; + return x; +} diff --git a/clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp b/clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp index 3b4a309327fe6..9b855698f57fd 100644 --- a/clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp +++ b/clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp @@ -11,6 +11,7 @@ typedef unsigned short poly16_t; typedef __fp16 float16_t; typedef float float32_t; typedef double float64_t; +typedef __mfp8 mfloat8_t; typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t; typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; @@ -26,6 +27,8 @@ typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t; typedef __attribute__((neon_vector_type(2))) unsigned int uint32x2_t; typedef __attribute__((neon_vector_type(4))) unsigned int uint32x4_t; typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; +typedef __attribute__((neon_vector_type(8))) mfloat8_t mfloat8x8_t; +typedef __attribute__((neon_vector_type(16))) mfloat8_t mfloat8x16_t; typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t; typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t; typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t; @@ -82,3 +85,7 @@ void f21(int64x2_t) {} void f22(uint64x2_t) {} // CHECK: 13__Float64x2_t void f23(float64x2_t) {} +// CHECK: 13__Mfloat8x8_t +void f24(mfloat8x8_t) {} +// CHECK: 14__Mfloat8x16_t +void f25(mfloat8x16_t) {} diff --git a/clang/test/CodeGenCXX/mangle-neon-vectors.cpp b/clang/test/CodeGenCXX/mangle-neon-vectors.cpp index cb5e40be6a6df..2139a8ae98caf 100644 --- a/clang/test/CodeGenCXX/mangle-neon-vectors.cpp +++ b/clang/test/CodeGenCXX/mangle-neon-vectors.cpp @@ -9,6 +9,7 @@ typedef __fp16 float16_t; #if defined(__aarch64__) typedef unsigned char poly8_t; typedef unsigned short poly16_t; +typedef __mfp8 mfloat8_t; #else typedef signed char poly8_t; typedef short poly16_t; @@ -29,6 +30,8 @@ typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t; typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t; #ifdef __aarch64__ typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t; +typedef __attribute__((neon_vector_type(8))) mfloat8_t mfloat8x8_t; +typedef __attribute__((neon_vector_type(16))) mfloat8_t mfloat8x16_t; #endif typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t; typedef __attribute__((neon_polyvector_type(8))) poly16_t poly16x8_t; @@ -86,3 +89,11 @@ void f11(float64x2_t v) { } // CHECK-AARCH64-BF16: 14__Bfloat16x4_t void f12(bfloat16x4_t v) {} #endif + + +#ifdef __aarch64__ +// 
CHECK-AARCH64: 13__Mfloat8x8_t
+void f13(mfloat8x8_t v) { }
+// CHECK-AARCH64: 14__Mfloat8x16_t
+void f14(mfloat8x16_t v) { }
+#endif
diff --git a/clang/test/Sema/aarch64-fp8-cast.c b/clang/test/Sema/aarch64-fp8-cast.c
new file mode 100644
index 0000000000000..ad25401919b5a
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-cast.c
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -verify -emit-llvm -o - %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// Bitcast between FP8 Neon vectors
+mfloat8x8_t err_test_f8_f8(mfloat8x16_t x) {
+  return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values) of different size}}
+}
+
+mfloat8x16_t err_testq_f8_f8(mfloat8x8_t x) {
+  return (mfloat8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) of different size}}
+}
+
+// Bitcast between FP8 and int8 Neon vectors
+mfloat8x8_t err_test_f8_s8(int8x16_t x) {
+  return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'int8x16_t' (vector of 16 'int8_t' values) of different size}}
+}
+
+int8x8_t err_test_s8_f8(mfloat8x16_t x) {
+  return (int8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'int8x8_t' (vector of 8 'int8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values) of different size}}
+}
+
+mfloat8x16_t err_testq_f8_s8(int8x8_t x) {
+  return (mfloat8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'int8x8_t' (vector of 8 'int8_t' values) of different size}}
+}
+
+int8x16_t err_testq_s8_f8(mfloat8x8_t x) {
+  return (int8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'int8x16_t' (vector of 16 'int8_t' values) and 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) of different size}}
+}
+
+// Bitcast between FP8 and float32 Neon vectors
+mfloat8x8_t err_test_f8_f32(float32x4_t x) {
+  return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'float32x4_t' (vector of 4 'float32_t' values) of different size}}
+}
+
+float32x2_t err_test_f32_f8(mfloat8x16_t x) {
+  return (float32x2_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'float32x2_t' (vector of 2 'float32_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values) of different size}}
+}
+
+mfloat8x16_t err_testq_f8_f32(float32x2_t x) {
+  return (mfloat8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'float32x2_t' (vector of 2 'float32_t' values) of different size}}
+}
+
+float32x4_t err_testq_f32_f8(mfloat8x8_t x) {
+  return (float32x4_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'float32x4_t' (vector of 4 'float32_t' values) and 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) of different size}}
+}
+
+// Bitcast between FP8 and poly128_t (which is integral)
+mfloat8x8_t err_testq_f8_p128(poly128_t x) {
+  return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and integer type 'poly128_t' (aka 'unsigned __int128') of different size}}
+}
+
+poly128_t err_testq_p128_f8(mfloat8x8_t x) {
+  return (poly128_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and integer type 'poly128_t' (aka 'unsigned __int128') of different size}}
+}
+
+// Bitcast between FP8 and a non-integral type
+mfloat8x8_t err_test_f8_ptr(void *p) {
+  return (mfloat8x8_t) p;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and scalar type 'void *'}}
+}
+
+void *err_test_ptr_f8(mfloat8x8_t v) {
+  return (void *) v;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and scalar type 'void *'}}
+}
+
+mfloat8x8_t err_test_f8_dbl(double v) {
+  return (mfloat8x8_t) v;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and scalar type 'double'}}
+}
+
+double err_test_dbl_f8(mfloat8x8_t v) {
+  return (double) v;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and scalar type 'double'}}
+}
+
+struct S {
+  char ch[16];
+};
+
+mfloat8x16_t err_test_f8_agg(struct S s) {
+  return (mfloat8x16_t) s;
+// expected-error@-1 {{operand of type 'struct S' where arithmetic or pointer type is required}}
+}
+
+struct S err_test_agg_f8(mfloat8x16_t v) {
+  return (struct S) v;
+// expected-error@-1 {{used type 'struct S' where arithmetic or pointer type is required}}
+}
diff --git a/clang/test/Sema/arm-mfp8.cpp b/clang/test/Sema/arm-mfp8.cpp
index be5bc9bb71dbd..1b4e6791420ec 100644
--- a/clang/test/Sema/arm-mfp8.cpp
+++ b/clang/test/Sema/arm-mfp8.cpp
@@ -48,17 +48,27 @@ void test_vector_sve(svmfloat8_t a, svuint8_t c) {
 #include <arm_neon.h>

 void test_vector(mfloat8x8_t a, mfloat8x16_t b, uint8x8_t c) {
-  a + b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}}
-  a - b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}}
-  a * b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}}
-  a / b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}}
+  a + a;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x8_t')}}
+  a - a;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x8_t')}}
+  a * a;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x8_t')}}
+  a / a;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x8_t')}}

-  a + c;  // neon-error {{cannot convert between vector and non-scalar values ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'uint8x8_t' (vector of 8 'uint8_t' values))}}
-  a - c;  // neon-error {{cannot convert between vector and non-scalar values ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'uint8x8_t' (vector of 8 'uint8_t' values))}}
-  a * c;  // neon-error {{cannot convert between vector and non-scalar values ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'uint8x8_t' (vector of 8 'uint8_t' values))}}
-  a / c;  // neon-error {{cannot convert between vector and non-scalar values ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'uint8x8_t' 
(vector of 8 'uint8_t' values))}} - c + b; // neon-error {{cannot convert between vector and non-scalar values ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}} - c - b; // neon-error {{cannot convert between vector and non-scalar values ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}} - c * b; // neon-error {{cannot convert between vector and non-scalar values ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}} - c / b; // neon-error {{cannot convert between vector and non-scalar values ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}} + b + b; // neon-error {{invalid operands to binary expression ('mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x16_t')}} + b - b; // neon-error {{invalid operands to binary expression ('mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x16_t')}} + b * b; // neon-error {{invalid operands to binary expression ('mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x16_t')}} + b / b; // neon-error {{invalid operands to binary expression ('mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x16_t')}} + + a + b; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + a - b; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + a * b; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + a / b; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + + a + c; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'uint8x8_t' (vector of 8 'uint8_t' values))}} + a - c; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'uint8x8_t' (vector of 8 'uint8_t' values))}} + a * c; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'uint8x8_t' (vector of 8 'uint8_t' values))}} + a / c; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'uint8x8_t' (vector of 8 'uint8_t' values))}} + c + b; // neon-error {{invalid operands to binary expression ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + c - b; // neon-error {{invalid operands to binary expression ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + c * b; // neon-error {{invalid operands to binary expression ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + c / b; // neon-error {{invalid operands to binary expression ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} } diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 49633bb7b7f58..7299a49252f0d 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -102,7 +102,7 @@ enum EltType { Float32, Float64, BFloat16, - MFloat8 // Not used by Sema or CodeGen in Clang + MFloat8 }; } 
// end namespace NeonTypeFlags @@ -2281,9 +2281,7 @@ static void emitNeonTypeDefs(const std::string& types, raw_ostream &OS) { InIfdef = true; } - if (T.isMFloat8()) - OS << "typedef __MFloat8x"; - else if (T.isPoly()) + if (T.isPoly()) OS << "typedef __attribute__((neon_polyvector_type("; else OS << "typedef __attribute__((neon_vector_type("; @@ -2291,10 +2289,7 @@ static void emitNeonTypeDefs(const std::string& types, raw_ostream &OS) { Type T2 = T; T2.makeScalar(); OS << T.getNumElements(); - if (T.isMFloat8()) - OS << "_t "; - else - OS << "))) " << T2.str(); + OS << "))) " << T2.str(); OS << " " << T.str() << ";\n"; } if (InIfdef) From 8f17f51deb12456f25d32b9a42ac1f00feabbfbc Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Mon, 27 Jan 2025 18:50:53 +0800 Subject: [PATCH 172/432] [mlir][tosa] Fix comments format(NFC) (#124520) This PR corrects the formatting of comments in Markdown. The previous format was as follows: https://mlir.llvm.org/docs/Dialects/TOSA/#tosaerf-mlirtosaerfop ![image](https://github.com/user-attachments/assets/1d1d10d5-c960-4724-9fb4-29c17ea39b11) https://mlir.llvm.org/docs/Dialects/TOSA/#tosarescale-mlirtosarescaleop ![image](https://github.com/user-attachments/assets/fb23cbf6-be10-4a60-8b43-b28dc2db6918) --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 35 ++++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 92ab729f5b933..2186510e7db1e 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -454,7 +454,7 @@ def Tosa_ErfOp : Tosa_ElementwiseUnaryOp<"erf"> { let summary = "Computes gauss error function of input"; let description = [{ - Gauss error function: $ erf(x) = \frac{2}{\sqrt(\pi)} \int_{0}^{x} e^{-t^2} \,dt $ + Gauss error function: $ erf(x) = \frac{2}{\sqrt(\pi)} \int_{0}^{x} e^{-t^2} \ dt $ For quantized integer data types, the TABLE operator should be used instead with the following definition. The erf_table has 513 entries each of 16-bit/8-bit precision and covering the input range -4.0 to +4.0 in steps of 1/64. @@ -1886,23 +1886,22 @@ def Tosa_RescaleOp: Tosa_Op<"rescale", [Pure, let description = [{ Rescale quantized values into a new domain. 
Supported rescalings are: - | Mode | Input | Output | Unsigned | Unsigned | - | | | | input | output | - |------------------------|-------|--------|----------|----------| - | signed 8 to 8 | int8 | int8 | false | false | - | signed 8 to 16 | int8 | int16 | false | false | - | signed 8 to 32 | int8 | int32 | false | false | - | signed 16 to 8 | int16 | int8 | false | false | - | signed 16 to 16 | int16 | int16 | false | false | - | signed 16 to 32 | int16 | int32 | false | false | - | signed 32 to 8 | int32 | int8 | false | false | - | signed 32 to 16 | int32 | int16 | false | false | - | signed 32 to 32 | int32 | int32 | false | false | - | signed 48 to 8 | int48 | int8 | false | false | - | signed 48 to 16 | int48 | int16 | false | false | - | signed 48 to 32 | int48 | int32 | false | false | - | unsigned 8 to signed 8 | uint8 | int8 | true | false | - | signed 8 to unsigned 8 | int8 | uint8 | false | true | + | Mode | Input | Output | Unsigned input | Unsigned output | + |------------------------|-------|--------|----------------|-----------------| + | signed 8 to 8 | int8 | int8 | false | false | + | signed 8 to 16 | int8 | int16 | false | false | + | signed 8 to 32 | int8 | int32 | false | false | + | signed 16 to 8 | int16 | int8 | false | false | + | signed 16 to 16 | int16 | int16 | false | false | + | signed 16 to 32 | int16 | int32 | false | false | + | signed 32 to 8 | int32 | int8 | false | false | + | signed 32 to 16 | int32 | int16 | false | false | + | signed 32 to 32 | int32 | int32 | false | false | + | signed 48 to 8 | int48 | int8 | false | false | + | signed 48 to 16 | int48 | int16 | false | false | + | signed 48 to 32 | int48 | int32 | false | false | + | unsigned 8 to signed 8 | uint8 | int8 | true | false | + | signed 8 to unsigned 8 | int8 | uint8 | false | true | }]; let arguments = (ins From 14ffff384740f484b382a1225f4bd01aeebfdc3f Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 27 Jan 2025 11:54:06 +0100 Subject: [PATCH 173/432] [clang] Add dump() support for lvalue APValues (#124476) Add some lvalue information to the `dump()` output of lvalue APValues. 
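As a sketch of the new output (pointer values elided; based on the test added
below), for

```cpp
struct S { int i; int ii; };
S s;
constexpr int *p = &s.ii;  // points at the member of s at offset 4
```

the dump of the initializer's value now reads roughly
`value: LValue Base=VarDecl 0x..., Null=0, Offset=4, HasPath=1, PathLength=1, Path=(1)`
instead of the bare `value: LValue` printed before; path entries are shown as
array indices because no type information is available at that point.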
---
 clang/lib/AST/TextNodeDumper.cpp           | 30 ++++++++++++-
 clang/test/AST/ast-dump-APValue-lvalue.cpp | 50 ++++++++++++++++++++++
 clang/test/AST/ast-dump-APValue-todo.cpp   |  4 --
 3 files changed, 78 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/AST/ast-dump-APValue-lvalue.cpp

diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 670641242cae2..46ec553fc05f0 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -710,10 +710,36 @@ void TextNodeDumper::Visit(const APValue &Value, QualType Ty) {
          << GetApproxValue(Value.getComplexFloatImag()) << 'i';
     }
     return;
-  case APValue::LValue:
+  case APValue::LValue: {
     (void)Context;
-    OS << "LValue ";
+    OS << "LValue Base=";
+    APValue::LValueBase B = Value.getLValueBase();
+    if (B.isNull())
+      OS << "null";
+    else if (const auto *BE = B.dyn_cast<const Expr *>()) {
+      OS << BE->getStmtClassName() << ' ';
+      dumpPointer(BE);
+    } else {
+      const auto *VDB = B.get<const ValueDecl *>();
+      OS << VDB->getDeclKindName() << "Decl";
+      dumpPointer(VDB);
+    }
+    OS << ", Null=" << Value.isNullPointer()
+       << ", Offset=" << Value.getLValueOffset().getQuantity()
+       << ", HasPath=" << Value.hasLValuePath();
+    if (Value.hasLValuePath()) {
+      OS << ", PathLength=" << Value.getLValuePath().size();
+      OS << ", Path=(";
+      llvm::ListSeparator Sep;
+      for (const auto &PathEntry : Value.getLValuePath()) {
+        // We're printing all entries as array indices because we don't have
+        // the type information here to do anything else.
+        OS << Sep << PathEntry.getAsArrayIndex();
+      }
+      OS << ")";
+    }
     return;
+  }
   case APValue::Array: {
     unsigned ArraySize = Value.getArraySize();
     unsigned NumInitializedElements = Value.getArrayInitializedElts();
diff --git a/clang/test/AST/ast-dump-APValue-lvalue.cpp b/clang/test/AST/ast-dump-APValue-lvalue.cpp
new file mode 100644
index 0000000000000..224caddb3eabe
--- /dev/null
+++ b/clang/test/AST/ast-dump-APValue-lvalue.cpp
@@ -0,0 +1,50 @@
+// Test without serialization:
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-unused-value -std=gnu++17 \
+// RUN:            -ast-dump %s -ast-dump-filter Test \
+// RUN: | FileCheck --strict-whitespace --match-full-lines %s
+//
+// Test with serialization:
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-unused-value -std=gnu++17 -emit-pch -o %t %s
+// RUN: %clang_cc1 -x c++ -triple x86_64-unknown-unknown -Wno-unused-value -std=gnu++17 \
+// RUN: -include-pch %t -ast-dump-all -ast-dump-filter Test /dev/null \
+// RUN: | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" \
+// RUN: | FileCheck --strict-whitespace --match-full-lines %s
+
+int i;
+struct S {
+  int i;
+  int ii;
+};
+S s;
+
+struct F {
+  char padding[12];
+  S s;
+};
+F f;
+
+void Test(int (&arr)[10]) {
+  constexpr int *pi = &i;
+  // CHECK:  | `-VarDecl {{.*}} col:{{.*}} pi 'int *const' constexpr cinit
+  // CHECK-NEXT:  |   |-value: LValue Base=VarDecl {{.*}}, Null=0, Offset=0, HasPath=1, PathLength=0, Path=()
+
+  constexpr int *psi = &s.i;
+  // CHECK:  | `-VarDecl {{.*}} col:{{.*}} psi 'int *const' constexpr cinit
+  // CHECK-NEXT:  |   |-value: LValue Base=VarDecl {{.*}}, Null=0, Offset=0, HasPath=1, PathLength=1, Path=({{.*}})
+
+  constexpr int *psii = &s.ii;
+  // CHECK:  | `-VarDecl {{.*}} col:{{.*}} psii 'int *const' constexpr cinit
+  // CHECK-NEXT:  |   |-value: LValue Base=VarDecl {{.*}}, Null=0, Offset=4, HasPath=1, PathLength=1, Path=({{.*}})
+
+  constexpr int *pf = &f.s.ii;
+  // CHECK:  | `-VarDecl {{.*}} col:{{.*}} pf 'int *const' constexpr cinit
+  // CHECK-NEXT:  |   |-value: LValue Base=VarDecl {{.*}}, Null=0, Offset=16, HasPath=1, PathLength=2, Path=({{.*}}, {{.*}})
+
+  constexpr char *pc = &f.padding[2];
+  // CHECK:  | `-VarDecl {{.*}} col:{{.*}} pc 'char *const' constexpr cinit
+  // CHECK-NEXT:  | |-value: LValue Base=VarDecl {{.*}}, Null=0, Offset=2, HasPath=1, PathLength=2, Path=({{.*}}, 2)
+
+  constexpr const int *n = nullptr;
+  // CHECK:  `-VarDecl {{.*}} col:{{.*}} n 'const int *const' constexpr cinit
+  // CHECK-NEXT:    |-value: LValue Base=null, Null=1, Offset=0, HasPath=1, PathLength=0, Path=()
+}
diff --git a/clang/test/AST/ast-dump-APValue-todo.cpp b/clang/test/AST/ast-dump-APValue-todo.cpp
index 78cc9cf36c73c..acaa82ba53b6f 100644
--- a/clang/test/AST/ast-dump-APValue-todo.cpp
+++ b/clang/test/AST/ast-dump-APValue-todo.cpp
@@ -16,10 +16,6 @@ struct S {
 };

 void Test() {
-  constexpr int *pi = &i;
-  // CHECK:  | `-VarDecl {{.*}} col:{{.*}} pi 'int *const' constexpr cinit
-  // CHECK-NEXT:  | |-value: LValue
-
   constexpr int(S::*pmi) = &S::i;
   // CHECK:  `-VarDecl {{.*}} col:{{.*}} pmi 'int (S::*const)' constexpr cinit
   // CHECK-NEXT:  |-value: MemberPointer

From 43a50deb63453cd3c800f097514d500536f9d436 Mon Sep 17 00:00:00 2001
From: Samuel Ginzburg
Date: Mon, 27 Jan 2025 05:58:26 -0500
Subject: [PATCH 174/432] [MLIR][ROCDL] Add GFX940 SMFMAC (2:4 sparsity)
 instructions to the ROCDL dialect (#124435)

# Overview

This PR adds 2:4 structured sparsity (sparse A, dense B) matrix multiply
instructions to ROCDL.

# Testing

I've added tests under mlir/test/Dialect and mlir/test/Target.
---
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 18 ++++
 mlir/test/Dialect/LLVMIR/rocdl.mlir          | 87 +++++++++++++++
 mlir/test/Target/LLVMIR/rocdl.mlir           | 89 ++++++++++++++++++++
 3 files changed, 194 insertions(+)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 95fbe7ed66a43..974712c581537 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -408,6 +408,24 @@ def ROCDL_mfma_i32_32x32x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x32.i8">;
 def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">;
 def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>;
 def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>;
+
+// 2:4 Sparsity ops (GFX940)
+def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">;
+def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">;
+def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">;
+def ROCDL_smfmac_f32_32x32x16_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.bf16">;
+def ROCDL_smfmac_i32_16x16x64_i8 : ROCDL_Mfma_IntrOp<"smfmac.i32.16x16x64.i8">;
+def ROCDL_smfmac_i32_32x32x32_i8 : ROCDL_Mfma_IntrOp<"smfmac.i32.32x32x32.i8">;
+def ROCDL_smfmac_f32_16x16x64_bf8_bf8 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x64.bf8.bf8">;
+def ROCDL_smfmac_f32_16x16x64_bf8_fp8 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x64.bf8.fp8">;
+def ROCDL_smfmac_f32_16x16x64_fp8_bf8 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x64.fp8.bf8">;
+def ROCDL_smfmac_f32_16x16x64_fp8_fp8 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x64.fp8.fp8">;
+def ROCDL_smfmac_f32_32x32x32_bf8_bf8 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x32.bf8.bf8">;
+def ROCDL_smfmac_f32_32x32x32_bf8_fp8 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x32.bf8.fp8">;
+def ROCDL_smfmac_f32_32x32x32_fp8_bf8 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x32.fp8.bf8">;
+def ROCDL_smfmac_f32_32x32x32_fp8_fp8 :
ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x32.fp8.fp8">;
+
+
 //===---------------------------------------------------------------------===//
 // WMMA intrinsics
 class ROCDL_Wmma_IntrOp<string mnemonic, list<int> overloadedOperands,
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 712f8c2a1caf6..5186e43398f01 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -258,6 +258,93 @@ func.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32,
   llvm.return
 }

+
+llvm.func @rocdl.smfmac(%arg0 : i32,
+                        %arg1 : vector<4 x f16>,
+                        %arg2 : vector<8 x f16>,
+                        %arg3 : vector<4 x f32>,
+                        %arg4 : vector<16 x f32>,
+                        %arg5 : vector<4 x i16>,
+                        %arg6 : vector<8 x i16>,
+                        %arg7 : vector<2xi32>,
+                        %arg8 : vector<4xi32>,
+                        %arg9 : vector<16xi32>) -> vector<4 x f32> {
+  %csti32 = llvm.mlir.constant(42 : i32) : i32
+
+  // CHECK-LABEL: rocdl.smfmac
+  // CHECK: rocdl.smfmac.f32.16x16x32.f16 %{{.*}} : (vector<4xf16>, vector<8xf16>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
+  %r0 = rocdl.smfmac.f32.16x16x32.f16 %arg1, %arg2, %arg3, %csti32, %csti32, %csti32 :
+        (vector<4xf16>, vector<8xf16>, vector<4xf32>,
+        i32, i32, i32) -> vector<4xf32>
+
+  // CHECK: rocdl.smfmac.f32.32x32x16.f16 %{{.*}} : (vector<4xf16>, vector<8xf16>, vector<16xf32>, i32, i32, i32) -> vector<16xf32>
+  %r1 = rocdl.smfmac.f32.32x32x16.f16 %arg1, %arg2, %arg4, %csti32, %csti32, %csti32 :
+        (vector<4xf16>, vector<8xf16>, vector<16xf32>,
+        i32, i32, i32) -> vector<16xf32>
+
+  // CHECK: rocdl.smfmac.f32.16x16x32.bf16 %{{.*}} : (vector<4xi16>, vector<8xi16>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
+  %r2 = rocdl.smfmac.f32.16x16x32.bf16 %arg5, %arg6, %arg3, %csti32, %csti32, %csti32 :
+        (vector<4xi16>, vector<8xi16>, vector<4xf32>,
+        i32, i32, i32) -> vector<4xf32>
+
+  // CHECK: rocdl.smfmac.f32.32x32x16.bf16 %{{.*}} : (vector<4xi16>, vector<8xi16>, vector<16xf32>, i32, i32, i32) -> vector<16xf32>
+  %r3 = rocdl.smfmac.f32.32x32x16.bf16 %arg5, %arg6, %arg4, %csti32, %csti32, %csti32 :
+        (vector<4xi16>, vector<8xi16>, vector<16xf32>,
+        i32, i32, i32) -> vector<16xf32>
+
+  // CHECK: rocdl.smfmac.i32.16x16x64.i8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<4xi32>, i32, i32, i32) -> vector<4xi32>
+  %r4 = rocdl.smfmac.i32.16x16x64.i8 %arg7, %arg8, %arg8, %csti32, %csti32, %csti32 :
+        (vector<2xi32>, vector<4xi32>, vector<4xi32>,
+        i32, i32, i32) -> vector<4xi32>
+
+  // CHECK: rocdl.smfmac.i32.32x32x32.i8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<16xi32>, i32, i32, i32) -> vector<16xi32>
+  %r5 = rocdl.smfmac.i32.32x32x32.i8 %arg7, %arg8, %arg9, %csti32, %csti32, %csti32 :
+        (vector<2xi32>, vector<4xi32>, vector<16xi32>,
+        i32, i32, i32) -> vector<16xi32>
+
+  // CHECK: rocdl.smfmac.f32.16x16x64.bf8.bf8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
+  %r6 = rocdl.smfmac.f32.16x16x64.bf8.bf8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 :
+        (vector<2xi32>, vector<4xi32>, vector<4xf32>,
+        i32, i32, i32) -> vector<4xf32>
+
+  // CHECK: rocdl.smfmac.f32.16x16x64.bf8.fp8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
+  %r7 = rocdl.smfmac.f32.16x16x64.bf8.fp8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 :
+        (vector<2xi32>, vector<4xi32>, vector<4xf32>,
+        i32, i32, i32) -> vector<4xf32>
+
+  // CHECK: rocdl.smfmac.f32.16x16x64.fp8.bf8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
+  %r8 = rocdl.smfmac.f32.16x16x64.fp8.bf8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 :
+        (vector<2xi32>,
vector<4xi32>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: rocdl.smfmac.f32.16x16x64.fp8.fp8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32) -> vector<4xf32> + %r9 = rocdl.smfmac.f32.16x16x64.fp8.fp8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: rocdl.smfmac.f32.32x32x32.bf8.bf8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32) -> vector<16xf32> + %r10 = rocdl.smfmac.f32.32x32x32.bf8.bf8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: rocdl.smfmac.f32.32x32x32.bf8.fp8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32) -> vector<16xf32> + %r11 = rocdl.smfmac.f32.32x32x32.bf8.fp8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: rocdl.smfmac.f32.32x32x32.fp8.bf8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32) -> vector<16xf32> + %r12 = rocdl.smfmac.f32.32x32x32.fp8.bf8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: rocdl.smfmac.f32.32x32x32.fp8.fp8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32) -> vector<16xf32> + %r13 = rocdl.smfmac.f32.32x32x32.fp8.fp8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + llvm.return %r0 : vector<4 x f32> +} + llvm.func @rocdl.mfma.scale.f32.32x32x64.f8f6f4(%arg0 : i32, %arg1 : vector<16 x f32>, %arg2 : vector<8xi32>, %arg3 : vector<6xi32>, %arg4 : vector<4xi32>) { diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index b74edb6210683..326bd3ae6b6f8 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -398,6 +398,95 @@ llvm.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32, llvm.return %r0 : vector<32 x f32> } +llvm.func @rocdl.smfmac(%arg0 : i32, + %arg1 : vector<4 x f16>, + %arg2 : vector<8 x f16>, + %arg3 : vector<4 x f32>, + %arg4 : vector<16 x f32>, + %arg5 : vector<4 x i16>, + %arg6 : vector<8 x i16>, + %arg7 : vector<2xi32>, + %arg8 : vector<4xi32>, + %arg9 : vector<16xi32>) -> vector<4 x f32> { + %csti32 = llvm.mlir.constant(42 : i32) : i32 + + // CHECK-LABEL: rocdl.smfmac + + // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %{{.*}}, <8 x half> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r0 = rocdl.smfmac.f32.16x16x32.f16 %arg1, %arg2, %arg3, %csti32, %csti32, %csti32 : + (vector<4xf16>, vector<8xf16>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %{{.*}}, <8 x half> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r1 = rocdl.smfmac.f32.32x32x16.f16 %arg1, %arg2, %arg4, %csti32, %csti32, %csti32 : + (vector<4xf16>, vector<8xf16>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r2 = rocdl.smfmac.f32.16x16x32.bf16 %arg5, %arg6, %arg3, %csti32, %csti32, %csti32 : + (vector<4xi16>, vector<8xi16>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: call <16 x float> 
@llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r3 = rocdl.smfmac.f32.32x32x16.bf16 %arg5, %arg6, %arg4, %csti32, %csti32, %csti32 : + (vector<4xi16>, vector<8xi16>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 42, i32 42, i32 42) + %r4 = rocdl.smfmac.i32.16x16x64.i8 %arg7, %arg8, %arg8, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<4xi32>, + i32, i32, i32) -> vector<4xi32> + + // CHECK: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> %{{.*}}, i32 42, i32 42, i32 42) + %r5 = rocdl.smfmac.i32.32x32x32.i8 %arg7, %arg8, %arg9, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xi32>, + i32, i32, i32) -> vector<16xi32> + + // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r6 = rocdl.smfmac.f32.16x16x64.bf8.bf8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r7 = rocdl.smfmac.f32.16x16x64.bf8.fp8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r8 = rocdl.smfmac.f32.16x16x64.fp8.bf8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r9 = rocdl.smfmac.f32.16x16x64.fp8.fp8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r10 = rocdl.smfmac.f32.32x32x32.bf8.bf8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r11 = rocdl.smfmac.f32.32x32x32.bf8.fp8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r12 = rocdl.smfmac.f32.32x32x32.fp8.bf8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + + // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r13 = rocdl.smfmac.f32.32x32x32.fp8.fp8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) 
-> vector<16xf32> + + llvm.return %r0 : vector<4 x f32> +} + + llvm.func @rocdl.mfma.scale.f32.32x32x64.f8f6f4(%arg0 : i32, %arg1 : vector<16 x f32>, %arg2 : vector<8xi32>, %arg3 : vector<6xi32>, %arg4 : vector<4xi32>) -> vector<16 x f32> { From ac87d6b03642eca3901a7776d73be368299402e9 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Mon, 27 Jan 2025 12:28:09 +0100 Subject: [PATCH 175/432] [mlir][arith] Fold `arith.cmpi eq, %val, %one : i1` -> `%val` and `arith.cmpi ne, %val, %zero : i1 -> %val` (#124436) https://alive2.llvm.org/ce/z/dNZMdC --- flang/test/Lower/Intrinsics/ieee_next.f90 | 10 ++-- mlir/lib/Dialect/Arith/IR/ArithOps.cpp | 12 ++++ mlir/test/Dialect/Arith/canonicalize.mlir | 72 +++++++++++++++++++++++ 3 files changed, 88 insertions(+), 6 deletions(-) diff --git a/flang/test/Lower/Intrinsics/ieee_next.f90 b/flang/test/Lower/Intrinsics/ieee_next.f90 index fa9692b83bc87..eb9cc028368a5 100644 --- a/flang/test/Lower/Intrinsics/ieee_next.f90 +++ b/flang/test/Lower/Intrinsics/ieee_next.f90 @@ -131,9 +131,8 @@ program p ! CHECK: %[[V_106:[0-9]+]] = arith.bitcast %[[V_104]] : f32 to i32 ! CHECK: %[[V_107:[0-9]+]] = arith.shrui %[[V_106]], %c31{{.*}} : i32 ! CHECK: %[[V_108:[0-9]+]] = fir.convert %[[V_107]] : (i32) -> i1 - ! CHECK: %[[V_109:[0-9]+]] = arith.cmpi ne, %[[V_108]], %false{{[_0-9]*}} : i1 ! CHECK: %[[V_110:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 516 : i32}> : (f32) -> i1 - ! CHECK: %[[V_111:[0-9]+]] = arith.andi %[[V_110]], %[[V_109]] : i1 + ! CHECK: %[[V_111:[0-9]+]] = arith.andi %[[V_110]], %[[V_108]] : i1 ! CHECK: %[[V_112:[0-9]+]] = arith.ori %[[V_105]], %[[V_111]] : i1 ! CHECK: %[[V_113:[0-9]+]] = fir.if %[[V_112]] -> (f32) { ! CHECK: %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 1 : i32}> : (f32) -> i1 @@ -149,7 +148,7 @@ program p ! CHECK: } else { ! CHECK-DAG: %[[V_204:[0-9]+]] = arith.subi %[[V_106]], %c1{{.*}} : i32 ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.addi %[[V_106]], %c1{{.*}} : i32 - ! CHECK: %[[V_206:[0-9]+]] = arith.select %[[V_109]], %[[V_205]], %[[V_204]] : i32 + ! CHECK: %[[V_206:[0-9]+]] = arith.select %[[V_108]], %[[V_205]], %[[V_204]] : i32 ! CHECK: %[[V_207:[0-9]+]] = arith.bitcast %[[V_206]] : i32 to f32 ! CHECK: fir.result %[[V_207]] : f32 ! CHECK: } @@ -253,9 +252,8 @@ program p ! CHECK: %[[V_182:[0-9]+]] = arith.bitcast %[[V_180]] : f128 to i128 ! CHECK: %[[V_183:[0-9]+]] = arith.shrui %[[V_182]], %c127{{.*}} : i128 ! CHECK: %[[V_184:[0-9]+]] = fir.convert %[[V_183]] : (i128) -> i1 - ! CHECK: %[[V_185:[0-9]+]] = arith.cmpi ne, %[[V_184]], %false{{[_0-9]*}} : i1 ! CHECK: %[[V_186:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 516 : i32}> : (f128) -> i1 - ! CHECK: %[[V_187:[0-9]+]] = arith.andi %[[V_186]], %[[V_185]] : i1 + ! CHECK: %[[V_187:[0-9]+]] = arith.andi %[[V_186]], %[[V_184]] : i1 ! CHECK: %[[V_188:[0-9]+]] = arith.ori %[[V_181]], %[[V_187]] : i1 ! CHECK: %[[V_189:[0-9]+]] = fir.if %[[V_188]] -> (f128) { ! CHECK: %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 1 : i32}> : (f128) -> i1 @@ -271,7 +269,7 @@ program p ! CHECK: } else { ! CHECK-DAG: %[[V_204:[0-9]+]] = arith.subi %[[V_182]], %c1{{.*}} : i128 ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.addi %[[V_182]], %c1{{.*}} : i128 - ! CHECK: %[[V_206:[0-9]+]] = arith.select %[[V_185]], %[[V_205]], %[[V_204]] : i128 + ! CHECK: %[[V_206:[0-9]+]] = arith.select %[[V_184]], %[[V_205]], %[[V_204]] : i128 ! CHECK: %[[V_207:[0-9]+]] = arith.bitcast %[[V_206]] : i128 to f128 ! CHECK: fir.result %[[V_207]] : f128 ! 
CHECK: } diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp index 7ca104691e6df..75d59ba8c1a10 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp +++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp @@ -1865,6 +1865,18 @@ OpFoldResult arith::CmpIOp::fold(FoldAdaptor adaptor) { getPredicate() == arith::CmpIPredicate::ne) return extOp.getOperand(); } + + // arith.cmpi ne, %val, %zero : i1 -> %val + if (getElementTypeOrSelf(getLhs().getType()).isInteger(1) && + getPredicate() == arith::CmpIPredicate::ne) + return getLhs(); + } + + if (matchPattern(adaptor.getRhs(), m_One())) { + // arith.cmpi eq, %val, %one : i1 -> %val + if (getElementTypeOrSelf(getLhs().getType()).isInteger(1) && + getPredicate() == arith::CmpIPredicate::eq) + return getLhs(); } // Move constant to the right side. diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 522711b08f289..3a16ee3d4f8fd 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -160,6 +160,78 @@ func.func @selNotCond(%arg0: i1, %arg1 : i32, %arg2 : i32, %arg3 : i32, %arg4 : return %res1, %res2 : i32, i32 } +// CHECK-LABEL: @cmpiI1eq +// CHECK-SAME: (%[[ARG:.*]]: i1) +// CHECK: return %[[ARG]] +func.func @cmpiI1eq(%arg0: i1) -> i1 { + %one = arith.constant 1 : i1 + %res = arith.cmpi eq, %arg0, %one : i1 + return %res : i1 +} + +// CHECK-LABEL: @cmpiI1eqVec +// CHECK-SAME: (%[[ARG:.*]]: vector<4xi1>) +// CHECK: return %[[ARG]] +func.func @cmpiI1eqVec(%arg0: vector<4xi1>) -> vector<4xi1> { + %one = arith.constant dense<1> : vector<4xi1> + %res = arith.cmpi eq, %arg0, %one : vector<4xi1> + return %res : vector<4xi1> +} + +// CHECK-LABEL: @cmpiI1ne +// CHECK-SAME: (%[[ARG:.*]]: i1) +// CHECK: return %[[ARG]] +func.func @cmpiI1ne(%arg0: i1) -> i1 { + %zero = arith.constant 0 : i1 + %res = arith.cmpi ne, %arg0, %zero : i1 + return %res : i1 +} + +// CHECK-LABEL: @cmpiI1neVec +// CHECK-SAME: (%[[ARG:.*]]: vector<4xi1>) +// CHECK: return %[[ARG]] +func.func @cmpiI1neVec(%arg0: vector<4xi1>) -> vector<4xi1> { + %zero = arith.constant dense<0> : vector<4xi1> + %res = arith.cmpi ne, %arg0, %zero : vector<4xi1> + return %res : vector<4xi1> +} + +// CHECK-LABEL: @cmpiI1eqLhs +// CHECK-SAME: (%[[ARG:.*]]: i1) +// CHECK: return %[[ARG]] +func.func @cmpiI1eqLhs(%arg0: i1) -> i1 { + %one = arith.constant 1 : i1 + %res = arith.cmpi eq, %one, %arg0 : i1 + return %res : i1 +} + +// CHECK-LABEL: @cmpiI1eqVecLhs +// CHECK-SAME: (%[[ARG:.*]]: vector<4xi1>) +// CHECK: return %[[ARG]] +func.func @cmpiI1eqVecLhs(%arg0: vector<4xi1>) -> vector<4xi1> { + %one = arith.constant dense<1> : vector<4xi1> + %res = arith.cmpi eq, %one, %arg0 : vector<4xi1> + return %res : vector<4xi1> +} + +// CHECK-LABEL: @cmpiI1neLhs +// CHECK-SAME: (%[[ARG:.*]]: i1) +// CHECK: return %[[ARG]] +func.func @cmpiI1neLhs(%arg0: i1) -> i1 { + %zero = arith.constant 0 : i1 + %res = arith.cmpi ne, %zero, %arg0 : i1 + return %res : i1 +} + +// CHECK-LABEL: @cmpiI1neVecLhs +// CHECK-SAME: (%[[ARG:.*]]: vector<4xi1>) +// CHECK: return %[[ARG]] +func.func @cmpiI1neVecLhs(%arg0: vector<4xi1>) -> vector<4xi1> { + %zero = arith.constant dense<0> : vector<4xi1> + %res = arith.cmpi ne, %zero, %arg0 : vector<4xi1> + return %res : vector<4xi1> +} + // Test case: Folding of comparisons with equal operands. 
// CHECK-LABEL: @cmpi_equal_operands
// CHECK-DAG:   %[[T:.*]] = arith.constant true

From ddbfe6f7d2075a828fa9e8e5f5734bf881cda13a Mon Sep 17 00:00:00 2001
From: Robert Dazi <14996868+v01dXYZ@users.noreply.github.com>
Date: Mon, 27 Jan 2025 12:43:37 +0100
Subject: [PATCH 176/432] [Sema] Fix __array_rank instantiation (#124491)

The type being queried was left as a template type parameter, making the
whole expression dependent and thus not usable in static_assert.

Fixes #123498

Co-authored-by: v01dxyz
Co-authored-by: cor3ntin
---
 clang/docs/ReleaseNotes.rst                   |   2 +
 clang/include/clang/AST/ExprCXX.h             |   4 +-
 clang/lib/Sema/TreeTransform.h                |   3 -
 .../array-type-trait-with-template.cpp        | 129 ++++++++++++++++++
 4 files changed, 133 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/SemaCXX/array-type-trait-with-template.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b63bd366cfe88..c60565a568234 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1002,6 +1002,8 @@ Bug Fixes to C++ Support
 - Fixed assertions or false compiler diagnostics in the case of C++ modules for
   lambda functions or inline friend functions defined inside templates (#GH122493).
 - Clang now rejects declaring an alias template with the same name as its template parameter. (#GH123423)
+- Fix type of expression when calling a template which returns an ``__array_rank`` querying a type depending on a
+  template parameter. Now, such expression can be used with ``static_assert`` and ``constexpr``. (#GH123498)
 - Correctly determine the implicit constexprness of lambdas in dependent contexts. (#GH97958) (#GH114234)

 Bug Fixes to AST Handling
diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h
index aa10945addf78..2a130bc6da79a 100644
--- a/clang/include/clang/AST/ExprCXX.h
+++ b/clang/include/clang/AST/ExprCXX.h
@@ -2847,8 +2847,8 @@ class TypeTraitExpr final
 ///
 /// Example:
 /// \code
-///   __array_rank(int[10][20]) == 2
-///   __array_extent(int, 1) == 20
+///   __array_rank(int[10][20]) == 2
+///   __array_extent(int[10][20], 1) == 20
 /// \endcode
 class ArrayTypeTraitExpr : public Expr {
   /// The trait. An ArrayTypeTrait enum in MSVC compat unsigned.
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 12680843a434a..f04adf7fdf8ad 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -14947,9 +14947,6 @@ TreeTransform<Derived>::TransformArrayTypeTraitExpr(ArrayTypeTraitExpr *E) {
     SubExpr = getDerived().TransformExpr(E->getDimensionExpression());
     if (SubExpr.isInvalid())
       return ExprError();
-
-    if (!getDerived().AlwaysRebuild() && SubExpr.get() == E->getDimensionExpression())
-      return E;
   }

   return getDerived().RebuildArrayTypeTrait(E->getTrait(), E->getBeginLoc(), T,
diff --git a/clang/test/SemaCXX/array-type-trait-with-template.cpp b/clang/test/SemaCXX/array-type-trait-with-template.cpp
new file mode 100644
index 0000000000000..942714ec5d55a
--- /dev/null
+++ b/clang/test/SemaCXX/array-type-trait-with-template.cpp
@@ -0,0 +1,129 @@
+// RUN: %clang_cc1 -fsyntax-only %s
+// RUN: %clang_cc1 -fsyntax-only -std=c++20 -DWITH_AUTO_FUNCTION_PARAMETER=1 %s
+
+// When __array_rank is used with a template type parameter, this test
+// ensures clang treats the final expression as one that can be used with
+// static_assert/constexpr.
+//
+// Although array_extent was handled well, we add it as a precaution.
+
+template <typename T>
+using remove_reference_t = __remove_reference_t(T);
+
+template <typename T, int N>
+constexpr int array_rank(T (&lhs)[N]) {
+  return __array_rank(T[N]);
+}
+
+template <int I, typename T, int N>
+constexpr int array_extent(T (&lhs)[N]) {
+  return __array_extent(T[N], I);
+}
+
+template <typename T>
+struct Rank {
+  using ArrayT = remove_reference_t<T>;
+
+  template <int N>
+  static constexpr int call(ArrayT (&lhs)[N]) {
+    return __array_rank(ArrayT[N]);
+  }
+};
+
+template <typename T, int I>
+struct Extent {
+  using ArrayT = remove_reference_t<T>;
+
+  template <int N>
+  static constexpr int call(ArrayT (&lhs)[N]) {
+    return __array_extent(ArrayT[N], I);
+  }
+};
+
+#ifdef WITH_AUTO_FUNCTION_PARAMETER
+template <int N>
+constexpr int array_rank_auto(auto (&lhs)[N]) {
+  return __array_rank(remove_reference_t<decltype(lhs[0])>[N]);
+}
+
+template <int I, int N>
+constexpr int array_extent_auto(auto (&lhs)[N]) {
+  return __array_extent(remove_reference_t<decltype(lhs[0])>[N], I);
+}
+#endif
+
+template <int N>
+constexpr int array_rank_int(const int (&lhs)[N]) {
+  return __array_rank(const int[N]);
+}
+
+template <int I, int N>
+constexpr int array_extent_int(const int (&lhs)[N]) {
+  return __array_extent(const int[N], I);
+}
+
+template <int M, int N>
+constexpr int array_rank_int(const int (&lhs)[M][N]) {
+  return __array_rank(const int[M][N]);
+}
+
+template <int I, int M, int N>
+constexpr int array_extent_int(const int (&lhs)[M][N]) {
+  return __array_extent(const int[M][N], I);
+}
+
+int main() {
+  constexpr int vec[] = {0, 1, 2, 1};
+  constexpr int mat[4][4] = {
+    {1, 0, 0, 0},
+    {0, 1, 0, 0},
+    {0, 0, 1, 0},
+    {0, 0, 0, 1}
+  };
+
+#define ATT_TESTS_WITH_ASSERT(ATT_ASSERT)  \
+  { ATT_ASSERT(RANK(vec) == 1); }          \
+  { ATT_ASSERT(RANK(mat) == 2); }          \
+  { ATT_ASSERT(EXTENT(vec, 0) == 4); }     \
+  { ATT_ASSERT(EXTENT(vec, 1) == 0); }     \
+  { ATT_ASSERT(EXTENT(mat, 1) == 4); }
+
+#define ATT_TESTS()                                \
+  ATT_TESTS_WITH_ASSERT( constexpr bool cst = )    \
+  ATT_TESTS_WITH_ASSERT( (void) )                  \
+  ATT_TESTS_WITH_ASSERT( static_assert )
+
+  {
+#define RANK(lhs) array_rank(lhs)
+#define EXTENT(lhs, i) array_extent<i>(lhs)
+    ATT_TESTS();
+#undef RANK
+#undef EXTENT
+  }
+
+  {
+#define RANK(lhs) Rank<decltype(lhs[0])>::call(lhs)
+#define EXTENT(lhs, i) Extent<decltype(lhs[0]), i>::call(lhs)
+    ATT_TESTS();
+#undef RANK
+#undef EXTENT
+  }
+
+#ifdef WITH_AUTO_FUNCTION_PARAMETER
+  {
+#define RANK(lhs) array_rank_auto(lhs)
+#define EXTENT(lhs, i) array_extent_auto<i>(lhs)
+    ATT_TESTS();
+#undef RANK
+#undef EXTENT
+  }
+#endif
+
+  {
+#define RANK(lhs) array_rank_int(lhs)
+#define EXTENT(lhs, i) array_extent_int<i>(lhs)
+    ATT_TESTS();
+#undef RANK
+#undef EXTENT
+  }
+}

From b7286dbef9dc1986860d29e390b092599e1d7db5 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Mon, 27 Jan 2025 11:59:38 +0000
Subject: [PATCH 177/432] Reland "[LoopVectorize] Add support for reverse
 loops in isDereferenceableAndAlignedInLoop #96752" (#123616)

The last attempt failed a sanitiser build because we were creating a
reference to a null Predicates pointer in
isDereferenceableAndAlignedInLoop. This was exposed by the unit test
IsDerefReadOnlyLoop in unittests/Analysis/LoadsTest.cpp. I fixed this
by falling back on getConstantMaxBackedgeTakenCount if Predicates is
null - see line 316 in llvm/lib/Analysis/Loads.cpp. There are no other
changes.
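For a rough picture of what this enables, consider a sketch like the following (illustrative only: the function `sum_reverse`, the hypothetical `init` helper, and the bound 1024 are made up here; the updated load-deref-pred tests below are the authoritative examples):

```cpp
// Illustrative sketch: a conditionally-executed load with a negative stride.
extern void init(int (&buf)[1024]);  // hypothetical initializer

int sum_reverse(const int *cond) {
  int local[1024];  // locally allocated, so dereferenceable over its extent
  init(local);
  int sum = 0;
  // Reverse loop: with this change the accessed range [start, end] is
  // computed from the max backedge-taken count even for a negative step,
  // so the load of local[i] can be speculated and the loop vectorized
  // without predicated (masked) loads.
  for (int i = 1023; i >= 0; --i)
    if (cond[i] != 3)
      sum += local[i];
  return sum;
}
```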
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |  19 ++
 llvm/lib/Analysis/Loads.cpp                   | 113 +++++----
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |  61 ++---
 .../LoopVectorize/X86/load-deref-pred.ll      | 238 ++++++------------
 .../LoopVectorize/load-deref-pred-align.ll    |  27 +-
 5 files changed, 191 insertions(+), 267 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 31374a128856c..6fc6ca14d0889 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -853,6 +853,25 @@ bool sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, const DataLayout &DL,
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                          ScalarEvolution &SE, bool CheckType = true);

+/// Calculate Start and End points of memory access.
+/// Let's assume A is the first access and B is a memory access on N-th loop
+/// iteration. Then B is calculated as:
+///   B = A + Step*N .
+/// Step value may be positive or negative.
+/// N is a calculated back-edge taken count:
+///     N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
+/// Start and End points are calculated in the following way:
+/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
+/// where SizeOfElt is the size of single memory access in bytes.
+///
+/// There is no conflict when the intervals are disjoint:
+/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
+std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
+    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
+    ScalarEvolution *SE,
+    DenseMap<std::pair<const SCEV *, Type *>,
+             std::pair<const SCEV *, const SCEV *>> *PointerBounds);
+
 class LoopAccessInfoManager {
   /// The cache.
   DenseMap<Loop *, std::unique_ptr<LoopAccessInfo>> LoopAccessInfoMap;
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 9279f19b72a3f..691d7e4a3edcf 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -277,84 +278,90 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
 bool llvm::isDereferenceableAndAlignedInLoop(
     LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT,
     AssumptionCache *AC, SmallVectorImpl<const SCEVPredicate *> *Predicates) {
+  const Align Alignment = LI->getAlign();
   auto &DL = LI->getDataLayout();
   Value *Ptr = LI->getPointerOperand();
   APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
                 DL.getTypeStoreSize(LI->getType()).getFixedValue());
-  const Align Alignment = LI->getAlign();
-
-  Instruction *HeaderFirstNonPHI = &*L->getHeader()->getFirstNonPHIIt();

   // If given a uniform (i.e. non-varying) address, see if we can prove the
   // access is safe within the loop w/o needing predication.
   if (L->isLoopInvariant(Ptr))
-    return isDereferenceableAndAlignedPointer(Ptr, Alignment, EltSize, DL,
-                                              HeaderFirstNonPHI, AC, &DT);
+    return isDereferenceableAndAlignedPointer(
+        Ptr, Alignment, EltSize, DL, &*L->getHeader()->getFirstNonPHIIt(), AC,
+        &DT);
+
+  const SCEV *PtrScev = SE.getSCEV(Ptr);
+  auto *AddRec = dyn_cast<SCEVAddRecExpr>(PtrScev);

-  // Otherwise, check to see if we have a repeating access pattern where we can
-  // prove that all accesses are well aligned and dereferenceable.
-  auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Ptr));
+  // Check to see if we have a repeating access pattern and it's possible
+  // to prove all accesses are well aligned.
   if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine())
     return false;
-  auto* Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
+
+  auto *Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
   if (!Step)
     return false;

-  auto TC = SE.getSmallConstantMaxTripCount(L, Predicates);
-  if (!TC)
+  // For the moment, restrict ourselves to the case where the access size is a
+  // multiple of the requested alignment and the base is aligned.
+  // TODO: generalize if a case found which warrants
+  if (EltSize.urem(Alignment.value()) != 0)
     return false;

   // TODO: Handle overlapping accesses.
-  // We should be computing AccessSize as (TC - 1) * Step + EltSize.
-  if (EltSize.sgt(Step->getAPInt()))
+  if (EltSize.ugt(Step->getAPInt().abs()))
+    return false;
+
+  const SCEV *MaxBECount =
+      Predicates ? SE.getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates)
+                 : SE.getConstantMaxBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(MaxBECount))
+    return false;
+
+  const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess(
+      L, PtrScev, LI->getType(), MaxBECount, &SE, nullptr);
+  if (isa<SCEVCouldNotCompute>(AccessStart) ||
+      isa<SCEVCouldNotCompute>(AccessEnd))
     return false;

-  // Compute the total access size for access patterns with unit stride and
-  // patterns with gaps. For patterns with unit stride, Step and EltSize are the
-  // same.
-  // For patterns with gaps (i.e. non unit stride), we are
-  // accessing EltSize bytes at every Step.
-  APInt AccessSize = TC * Step->getAPInt();
+  // Try to get the access size.
+  const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart);
+  APInt MaxPtrDiff = SE.getUnsignedRangeMax(PtrDiff);

-  assert(SE.isLoopInvariant(AddRec->getStart(), L) &&
-         "implied by addrec definition");
   Value *Base = nullptr;
-  if (auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart())) {
-    Base = StartS->getValue();
-  } else if (auto *StartS = dyn_cast<SCEVAddExpr>(AddRec->getStart())) {
-    // Handle (NewBase + offset) as start value.
-    const auto *Offset = dyn_cast<SCEVConstant>(StartS->getOperand(0));
-    const auto *NewBase = dyn_cast<SCEVUnknown>(StartS->getOperand(1));
-    if (StartS->getNumOperands() == 2 && Offset && NewBase) {
-      // The following code below assumes the offset is unsigned, but GEP
-      // offsets are treated as signed so we can end up with a signed value
-      // here too. For example, suppose the initial PHI value is (i8 255),
-      // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
-      if (Offset->getAPInt().isNegative())
-        return false;
+  APInt AccessSize;
+  if (const SCEVUnknown *NewBase = dyn_cast<SCEVUnknown>(AccessStart)) {
+    Base = NewBase->getValue();
+    AccessSize = MaxPtrDiff;
+  } else if (auto *MinAdd = dyn_cast<SCEVAddExpr>(AccessStart)) {
+    if (MinAdd->getNumOperands() != 2)
+      return false;

-      // For the moment, restrict ourselves to the case where the offset is a
-      // multiple of the requested alignment and the base is aligned.
-      // TODO: generalize if a case found which warrants
-      if (Offset->getAPInt().urem(Alignment.value()) != 0)
-        return false;
-      Base = NewBase->getValue();
-      bool Overflow = false;
-      AccessSize = AccessSize.uadd_ov(Offset->getAPInt(), Overflow);
-      if (Overflow)
-        return false;
-    }
-  }
+    const auto *Offset = dyn_cast<SCEVConstant>(MinAdd->getOperand(0));
+    const auto *NewBase = dyn_cast<SCEVUnknown>(MinAdd->getOperand(1));
+    if (!Offset || !NewBase)
+      return false;

-  if (!Base)
-    return false;
+    // The following code below assumes the offset is unsigned, but GEP
+    // offsets are treated as signed so we can end up with a signed value
+    // here too. For example, suppose the initial PHI value is (i8 255),
+    // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
+    if (Offset->getAPInt().isNegative())
+      return false;

-  // For the moment, restrict ourselves to the case where the access size is a
-  // multiple of the requested alignment and the base is aligned.
-  // TODO: generalize if a case found which warrants
-  if (EltSize.urem(Alignment.value()) != 0)
+    // For the moment, restrict ourselves to the case where the offset is a
+    // multiple of the requested alignment and the base is aligned.
+    // TODO: generalize if a case found which warrants
+    if (Offset->getAPInt().urem(Alignment.value()) != 0)
+      return false;
+
+    AccessSize = MaxPtrDiff + Offset->getAPInt();
+    Base = NewBase->getValue();
+  } else
     return false;
+
+  Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
   return isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL,
                                             HeaderFirstNonPHI, AC, &DT);
 }
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 2a68979add666..11e0a221fc887 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -190,31 +190,20 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
   Members.push_back(Index);
 }

-/// Calculate Start and End points of memory access.
-/// Let's assume A is the first access and B is a memory access on N-th loop
-/// iteration. Then B is calculated as:
-///   B = A + Step*N .
-/// Step value may be positive or negative.
-/// N is a calculated back-edge taken count:
-///     N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
-/// Start and End points are calculated in the following way:
-/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
-/// where SizeOfElt is the size of single memory access in bytes.
-///
-/// There is no conflict when the intervals are disjoint:
-/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
-static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
-    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy,
-    PredicatedScalarEvolution &PSE,
+std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
+    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
+    ScalarEvolution *SE,
     DenseMap<std::pair<const SCEV *, Type *>,
-             std::pair<const SCEV *, const SCEV *>> &PointerBounds) {
-  ScalarEvolution *SE = PSE.getSE();
-
-  auto [Iter, Ins] = PointerBounds.insert(
-      {{PtrExpr, AccessTy},
-       {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
-  if (!Ins)
-    return Iter->second;
+             std::pair<const SCEV *, const SCEV *>> *PointerBounds) {
+  std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
+  if (PointerBounds) {
+    auto [Iter, Ins] = PointerBounds->insert(
+        {{PtrExpr, AccessTy},
+         {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
+    if (!Ins)
+      return Iter->second;
+    PtrBoundsPair = &Iter->second;
+  }

   const SCEV *ScStart;
   const SCEV *ScEnd;
@@ -222,10 +211,8 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
   if (SE->isLoopInvariant(PtrExpr, Lp)) {
     ScStart = ScEnd = PtrExpr;
   } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr)) {
-    const SCEV *Ex = PSE.getSymbolicMaxBackedgeTakenCount();
-
     ScStart = AR->getStart();
-    ScEnd = AR->evaluateAtIteration(Ex, *SE);
+    ScEnd = AR->evaluateAtIteration(MaxBECount, *SE);
     const SCEV *Step = AR->getStepRecurrence(*SE);

     // For expressions with negative step, the upper bound is ScStart and the
@@ -244,7 +231,7 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
       return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};

   assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
-  assert(SE->isLoopInvariant(ScEnd, Lp)&& "ScEnd needs to be invariant");
+  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");

   // Add the size of the pointed element to ScEnd.
   auto &DL = Lp->getHeader()->getDataLayout();
@@ -252,8 +239,10 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
   const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
   ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);

-  Iter->second = {ScStart, ScEnd};
-  return Iter->second;
+  std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
+  if (PointerBounds)
+    *PtrBoundsPair = Res;
+  return Res;
 }

 /// Calculate Start and End points of memory access using
@@ -263,8 +252,9 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                     unsigned DepSetId, unsigned ASId,
                                     PredicatedScalarEvolution &PSE,
                                     bool NeedsFreeze) {
+  const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
   const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
-      Lp, PtrExpr, AccessTy, PSE, DC.getPointerBounds());
+      Lp, PtrExpr, AccessTy, MaxBECount, PSE.getSE(), &DC.getPointerBounds());
   assert(!isa<SCEVCouldNotCompute>(ScStart) &&
          !isa<SCEVCouldNotCompute>(ScEnd) &&
          "must be able to compute both start and end expressions");
@@ -1938,10 +1928,11 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
   // required for correctness.
   if (SE.isLoopInvariant(Src, InnermostLoop) ||
       SE.isLoopInvariant(Sink, InnermostLoop)) {
-    const auto &[SrcStart_, SrcEnd_] =
-        getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE, PointerBounds);
-    const auto &[SinkStart_, SinkEnd_] =
-        getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE, PointerBounds);
+    const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
+    const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess(
+        InnermostLoop, Src, ATy, MaxBECount, PSE.getSE(), &PointerBounds);
+    const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess(
+        InnermostLoop, Sink, BTy, MaxBECount, PSE.getSE(), &PointerBounds);
     if (!isa<SCEVCouldNotCompute>(SrcStart_) &&
         !isa<SCEVCouldNotCompute>(SrcEnd_) &&
        !isa<SCEVCouldNotCompute>(SinkStart_) &&
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index 1433e48690bc6..3e50ee42866b9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -2920,8 +2920,8 @@ loop_exit:
   ret i32 %accum.next
 }

-define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
-; CHECK-LABEL: @neg_test_non_unit_stride_off_by_four_bytes(
+define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
+; CHECK-LABEL: @test_non_unit_stride_off_by_four_bytes(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [103 x i32], align 4
 ; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
@@ -2929,11 +2929,11 @@ define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base)
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP144:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP145:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP146:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP147:%.*]], [[PRED_LOAD_CONTINUE33]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP112:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP113:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP114:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP115:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
@@ -2999,170 +2999,74 @@ define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base)
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0
-; CHECK-NEXT:    br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP67]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] -; CHECK: pred.load.if4: -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP70]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] -; CHECK: pred.load.continue5: -; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP72]], [[PRED_LOAD_IF4]] ] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] -; CHECK: pred.load.if6: -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP75]], align 4 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] -; CHECK: pred.load.continue7: -; CHECK-NEXT: [[TMP78:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP77]], [[PRED_LOAD_IF6]] ] -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] -; CHECK: pred.load.if8: -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP80]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] -; CHECK: pred.load.continue9: -; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP78]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP82]], [[PRED_LOAD_IF8]] ] -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 -; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] -; CHECK: pred.load.if10: -; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] -; CHECK: pred.load.continue11: -; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP87]], [[PRED_LOAD_IF10]] ] -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] -; CHECK: pred.load.if12: -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP90]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] -; CHECK: pred.load.continue13: -; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE11]] ], 
[ [[TMP92]], [[PRED_LOAD_IF12]] ] -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] -; CHECK: pred.load.if14: -; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] -; CHECK: pred.load.continue15: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP97]], [[PRED_LOAD_IF14]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] -; CHECK: pred.load.if16: -; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] -; CHECK: pred.load.continue17: -; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP102]], [[PRED_LOAD_IF16]] ] -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] -; CHECK: pred.load.if18: -; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP105]], align 4 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> poison, i32 [[TMP106]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] -; CHECK: pred.load.continue19: -; CHECK-NEXT: [[TMP108:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP107]], [[PRED_LOAD_IF18]] ] -; CHECK-NEXT: [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] -; CHECK: pred.load.if20: -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP110]], align 4 -; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] -; CHECK: pred.load.continue21: -; CHECK-NEXT: [[TMP113:%.*]] = phi <4 x i32> [ [[TMP108]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP112]], [[PRED_LOAD_IF20]] ] -; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] -; CHECK: pred.load.if22: -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 -; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] -; CHECK: pred.load.continue23: -; CHECK-NEXT: [[TMP118:%.*]] = phi <4 x i32> [ [[TMP113]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP117]], [[PRED_LOAD_IF22]] ] -; CHECK-NEXT: [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] -; CHECK: pred.load.if24: -; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] -; CHECK-NEXT: 
[[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4 -; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] -; CHECK: pred.load.continue25: -; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP118]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP122]], [[PRED_LOAD_IF24]] ] -; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] -; CHECK: pred.load.if26: -; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP125]], align 4 -; CHECK-NEXT: [[TMP127:%.*]] = insertelement <4 x i32> poison, i32 [[TMP126]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] -; CHECK: pred.load.continue27: -; CHECK-NEXT: [[TMP128:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP127]], [[PRED_LOAD_IF26]] ] -; CHECK-NEXT: [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] -; CHECK: pred.load.if28: -; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP131:%.*]] = load i32, ptr [[TMP130]], align 4 -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] -; CHECK: pred.load.continue29: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP128]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP132]], [[PRED_LOAD_IF28]] ] -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] -; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 -; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] -; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP138:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP137]], [[PRED_LOAD_IF30]] ] -; CHECK-NEXT: [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] -; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP141:%.*]] = load i32, ptr [[TMP140]], align 4 -; CHECK-NEXT: [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] -; CHECK: pred.load.continue33: -; CHECK-NEXT: [[TMP143:%.*]] = phi <4 x i32> [ [[TMP138]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP142]], [[PRED_LOAD_IF32]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP83]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP143]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP144]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP145]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]] -; CHECK-NEXT: [[TMP146]] = add <4 x i32> [[VEC_PHI2]], 
[[PREDPHI35]] -; CHECK-NEXT: [[TMP147]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]] +; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP64]], align 4 +; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4 +; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4 +; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 +; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4 +; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4 +; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 +; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 +; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 +; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4 +; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4 +; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4 +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 +; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4 +; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4 +; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4 +; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr 
[[TMP79]], align 4 +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0 +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1 +; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2 +; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP110]], i32 [[TMP107]], i32 3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP95]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP111]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP112]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP113]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI4]] +; CHECK-NEXT: [[TMP114]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] +; CHECK-NEXT: [[TMP115]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP148:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 -; CHECK-NEXT: br i1 [[TMP148]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 +; CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP145]], [[TMP144]] -; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP146]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP147]], [[BIN_RDX37]] -; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP113]], [[TMP112]] +; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP115]], [[BIN_RDX7]] +; CHECK-NEXT: [[TMP117:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) ; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP117]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -3181,7 +3085,7 @@ define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 100 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP37:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP117]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 0f4e327891899..cbc483fabc184 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -351,27 
+351,30 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) { ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) ; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 -1 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD1]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = shl nsw i32 [[TMP11]], 2 ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.if1: +; CHECK: pred.store.if3: ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 1 ; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i32 [[TMP16]], 2 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue2: +; CHECK: pred.store.continue4: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -665,12 +668,15 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) ; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: 
[[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]] ; CHECK-NEXT: [[TMP15:%.*]] = shl nsw i32 [[TMP11]], 2 ; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 @@ -680,9 +686,6 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.if1: ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -1 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP23]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP24]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i32 [[TMP12]], 2 ; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 From b8d921003d1f20819b897b066e02d22787f11550 Mon Sep 17 00:00:00 2001 From: David Truby Date: Mon, 27 Jan 2025 12:23:03 +0000 Subject: [PATCH 178/432] [flang][NFC] Restrict -funroll-loops tests to known working targets (#123939) If -funroll-loops tests are not restricted to specific targets the tests may behave differently based on the host platform. This patch restricts the tests to aarch64 and x86_64, and removes the PowerPC XFAIL. --- flang/test/HLFIR/unroll-loops.fir | 12 ++++++++---- flang/test/Integration/unroll-loops.f90 | 15 ++++++++------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/flang/test/HLFIR/unroll-loops.fir b/flang/test/HLFIR/unroll-loops.fir index 4494cfa570dd7..1c214f76f5649 100644 --- a/flang/test/HLFIR/unroll-loops.fir +++ b/flang/test/HLFIR/unroll-loops.fir @@ -1,7 +1,11 @@ -// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 
-triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL // FIXME: https://github.com/llvm/llvm-project/issues/123668 // XFAIL: target=powerpc64{{.*}} diff --git a/flang/test/Integration/unroll-loops.f90 b/flang/test/Integration/unroll-loops.f90 index 4b4a394502881..86c57dd2fd0ea 100644 --- a/flang/test/Integration/unroll-loops.f90 +++ b/flang/test/Integration/unroll-loops.f90 @@ -1,10 +1,11 @@ -! RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL - -! FIXME: https://github.com/llvm/llvm-project/issues/123668 -! XFAIL: target=powerpc64{{.*}} +! RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL ! CHECK-LABEL: @unroll ! 
CHECK-SAME: (ptr nocapture writeonly %[[ARG0:.*]]) From 98e52db4a5e57f919bb70312f9ca7deb16ee6fcb Mon Sep 17 00:00:00 2001 From: David Truby Date: Mon, 27 Jan 2025 12:37:58 +0000 Subject: [PATCH 179/432] Revert "[flang][NFC] Restrict -funroll-loops tests to known working targets" (#124536) Reverts llvm/llvm-project#123939 --- flang/test/HLFIR/unroll-loops.fir | 12 ++++-------- flang/test/Integration/unroll-loops.f90 | 15 +++++++-------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/flang/test/HLFIR/unroll-loops.fir b/flang/test/HLFIR/unroll-loops.fir index 1c214f76f5649..4494cfa570dd7 100644 --- a/flang/test/HLFIR/unroll-loops.fir +++ b/flang/test/HLFIR/unroll-loops.fir @@ -1,11 +1,7 @@ -// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL // FIXME: https://github.com/llvm/llvm-project/issues/123668 // XFAIL: target=powerpc64{{.*}} diff --git a/flang/test/Integration/unroll-loops.f90 b/flang/test/Integration/unroll-loops.f90 index 86c57dd2fd0ea..4b4a394502881 100644 --- a/flang/test/Integration/unroll-loops.f90 +++ b/flang/test/Integration/unroll-loops.f90 @@ -1,11 +1,10 @@ -! RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -! 
RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL + +! FIXME: https://github.com/llvm/llvm-project/issues/123668 +! XFAIL: target=powerpc64{{.*}} ! CHECK-LABEL: @unroll ! CHECK-SAME: (ptr nocapture writeonly %[[ARG0:.*]]) From 6087c3049656bbaef51fffb48e2404e86f7e0d3f Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Mon, 27 Jan 2025 13:41:18 +0100 Subject: [PATCH 180/432] [lldb] Simplify preprocessor conditional (#124522) The long list of defines is just a very elaborate way to say "not windows". --- lldb/source/Host/common/Host.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index fdb623667bc25..d5054008268b9 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -11,13 +11,19 @@ #include #include #include + #ifndef _WIN32 #include #include #include #include #include +#include +#include #include +#if !defined(__ANDROID__) +#include +#endif #endif #if defined(__APPLE__) @@ -26,16 +32,6 @@ #include #endif -#if defined(__linux__) || defined(__FreeBSD__) || \ - defined(__FreeBSD_kernel__) || defined(__APPLE__) || \ - defined(__NetBSD__) || defined(__OpenBSD__) || defined(__EMSCRIPTEN__) -#if !defined(__ANDROID__) -#include -#endif -#include -#include -#endif - #if defined(__FreeBSD__) #include #endif From cfdd7d736a94aa65a23eb41258d9d6712cdb2b0d Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Mon, 27 Jan 2025 12:50:10 +0000 Subject: [PATCH 181/432] [compiler-rt][rtsan] sched cpu affinity for linux interception. 
(#124194) --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 22 +++++++++++++++++++ .../tests/rtsan_test_interceptors_posix.cpp | 16 ++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 1b499f2194f21..7d8b1c84f7d1c 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -740,6 +740,26 @@ INTERCEPTOR(int, sched_yield, void) { return REAL(sched_yield)(); } +#if SANITIZER_LINUX +INTERCEPTOR(int, sched_getaffinity, pid_t pid, size_t len, cpu_set_t *set) { + __rtsan_notify_intercepted_call("sched_getaffinity"); + return REAL(sched_getaffinity)(pid, len, set); +} + +INTERCEPTOR(int, sched_setaffinity, pid_t pid, size_t len, + const cpu_set_t *set) { + __rtsan_notify_intercepted_call("sched_setaffinity"); + return REAL(sched_setaffinity)(pid, len, set); +} +#define RTSAN_MAYBE_INTERCEPT_SCHED_GETAFFINITY \ + INTERCEPT_FUNCTION(sched_getaffinity) +#define RTSAN_MAYBE_INTERCEPT_SCHED_SETAFFINITY \ + INTERCEPT_FUNCTION(sched_setaffinity) +#else +#define RTSAN_MAYBE_INTERCEPT_SCHED_GETAFFINITY +#define RTSAN_MAYBE_INTERCEPT_SCHED_SETAFFINITY +#endif + // Memory INTERCEPTOR(void *, calloc, SIZE_T num, SIZE_T size) { @@ -1415,6 +1435,8 @@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(usleep); INTERCEPT_FUNCTION(nanosleep); INTERCEPT_FUNCTION(sched_yield); + RTSAN_MAYBE_INTERCEPT_SCHED_GETAFFINITY; + RTSAN_MAYBE_INTERCEPT_SCHED_SETAFFINITY; INTERCEPT_FUNCTION(accept); INTERCEPT_FUNCTION(bind); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index a4f2b92b7c494..ef9ec626610d5 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -323,6 +323,22 @@ TEST(TestRtsanInterceptors, SchedYieldDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +#if SANITIZER_LINUX +TEST(TestRtsanInterceptors, SchedGetaffinityDiesWhenRealtime) { + cpu_set_t set{}; + auto Func = [&set]() { sched_getaffinity(0, sizeof(set), &set); }; + ExpectRealtimeDeath(Func, "sched_getaffinity"); + ExpectNonRealtimeSurvival(Func); +} + +TEST(TestRtsanInterceptors, SchedSetaffinityDiesWhenRealtime) { + cpu_set_t set{}; + auto Func = [&set]() { sched_setaffinity(0, sizeof(set), &set); }; + ExpectRealtimeDeath(Func, "sched_setaffinity"); + ExpectNonRealtimeSurvival(Func); +} +#endif + /* Filesystem */ From e21b80464a44ef6491e44517ac59892c10ba2d6c Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Mon, 27 Jan 2025 12:52:35 +0000 Subject: [PATCH 182/432] [compiler-rt][rtsan] socketpair interception. 
(#124107) --- compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp | 6 ++++++ .../lib/rtsan/tests/rtsan_test_interceptors_posix.cpp | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 7d8b1c84f7d1c..6816119065263 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -1088,6 +1088,11 @@ INTERCEPTOR(int, setsockopt, int socket, int level, int option, #define RTSAN_MAYBE_INTERCEPT_SETSOCKOPT #endif +INTERCEPTOR(int, socketpair, int domain, int type, int protocol, int pair[2]) { + __rtsan_notify_intercepted_call("socketpair"); + return REAL(socketpair)(domain, type, protocol, pair); +} + // I/O Multiplexing INTERCEPTOR(int, poll, struct pollfd *fds, nfds_t nfds, int timeout) { @@ -1459,6 +1464,7 @@ void __rtsan::InitializeInterceptors() { RTSAN_MAYBE_INTERCEPT_GETPEERNAME; RTSAN_MAYBE_INTERCEPT_GETSOCKOPT; RTSAN_MAYBE_INTERCEPT_SETSOCKOPT; + INTERCEPT_FUNCTION(socketpair); RTSAN_MAYBE_INTERCEPT_SELECT; INTERCEPT_FUNCTION(pselect); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index ef9ec626610d5..59663776366bb 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -1351,6 +1351,13 @@ TEST(TestRtsanInterceptors, SetsockoptOnASocketDiesWhenRealtime) { } #endif +TEST(TestRtsanInterceptors, SocketpairDiesWhenRealtime) { + int pair[2]{}; + auto Func = [&pair]() { socketpair(0, 0, 0, pair); }; + ExpectRealtimeDeath(Func, "socketpair"); + ExpectNonRealtimeSurvival(Func); +} + /* I/O Multiplexing */ From d8ad1eef8ffeb4ef5474f0e38d6d340d82c53572 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 27 Jan 2025 12:53:38 +0000 Subject: [PATCH 183/432] [AArch64] Generate zeroing forms of certain SVE2.2 instructions (7/11) (#116833) SVE2.2 introduces instructions with predicated forms with zeroing of the inactive lanes. This allows in some cases to save a `movprfx` or a `mov` instruction when emitting code for `_x` or `_z` variants of intrinsics. This patch adds support for emitting the zeroing forms of certain `FLOGB` instructions. 
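As a rough illustration of the saving (a sketch, not code from this patch; it assumes the ACLE `svlogb` intrinsic from `arm_sve.h` and mirrors the codegen shown by the `test_svlogb_f16_z` case in the new test file below):

```c
#include <arm_sve.h>

// svlogb_f16_z must zero the inactive lanes of the result. Without a
// zeroing form of FLOGB, codegen needs an extra instruction to clear
// the destination first:
//   SVE2:    mov   z0.h, #0
//            flogb z0.h, p0/m, z1.h
//   SVE2.2:  flogb z0.h, p0/z, z1.h
svint16_t logb_z(svbool_t pg, svfloat16_t x) {
  return svlogb_f16_z(pg, x);
}
```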
--- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 2 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 6 +- .../CodeGen/AArch64/zeroing-forms-flogb.ll | 258 ++++++++++++++++++ 3 files changed, 264 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/zeroing-forms-flogb.ll diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 9ed683e73e9cc..2d6a3b6199c67 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4280,7 +4280,7 @@ let Predicates = [HasSVE2p2_or_SME2p2] in { defm SCVTF_ZPzZ : sve_fp_z2op_p_zd_c<0b0, "scvtf", "int_aarch64_sve_scvtf", AArch64scvtf_mt>; defm UCVTF_ZPzZ : sve_fp_z2op_p_zd_c<0b1, "ucvtf", "int_aarch64_sve_ucvtf", AArch64ucvtf_mt>; // Signed integer base 2 logarithm of fp value, zeroing predicate - defm FLOGB_ZPzZ : sve_fp_z2op_p_zd_d_flogb<"flogb">; + defm FLOGB_ZPzZ : sve_fp_z2op_p_zd_d_flogb<"flogb", int_aarch64_sve_flogb>; // SVE2 integer unary operations, zeroing predicate def URECPE_ZPzZ : sve2_int_un_pred_arit_z<0b10, 0b00, "urecpe", ZPR32>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 8125014faa033..199b2e343d3f7 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -3332,10 +3332,14 @@ multiclass sve_fp_z2op_p_zd_c(NAME # _DtoD)>; } -multiclass sve_fp_z2op_p_zd_d_flogb { +multiclass sve_fp_z2op_p_zd_d_flogb { def _H : sve_fp_z2op_p_zd<0b0011001, asm, ZPR16, ZPR16>; def _S : sve_fp_z2op_p_zd<0b0011010, asm, ZPR32, ZPR32>; def _D : sve_fp_z2op_p_zd<0b0011011, asm, ZPR64, ZPR64>; + + defm : SVE_3_Op_UndefZero_Pat(NAME # _H)>; + defm : SVE_3_Op_UndefZero_Pat(NAME # _S)>; + defm : SVE_3_Op_UndefZero_Pat(NAME # _D)>; } multiclass sve_fp_z2op_p_zd_b_0 { diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-flogb.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-flogb.ll new file mode 100644 index 0000000000000..23620a3419b99 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-flogb.ll @@ -0,0 +1,258 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve2 < %s | FileCheck %s +; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2 + +; RUN: llc -mattr=+sme2 -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2 + +target triple = "aarch64-linux" + +define @test_svlogb_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svlogb_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: flogb z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svlogb_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: flogb z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svlogb_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: flogb z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; 
CHECK-2p2-LABEL: test_svlogb_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svlogb_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svlogb_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: flogb z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svlogb_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: flogb z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svlogb_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: flogb z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svlogb_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svlogb_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: flogb z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svlogb_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: flogb z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svlogb_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: flogb z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svlogb_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svlogb_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: flogb z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: flogb z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.flogb.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svlogb_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: flogb z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: 
test_svlogb_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: flogb z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.flogb.nxv8f16( %x, %pg, %y) + ret %0 +} + +define @test_svlogb_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svlogb_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: flogb z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: flogb z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.flogb.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svlogb_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: flogb z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: flogb z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.flogb.nxv4f32( %x, %pg, %y) + ret %0 +} + +define @test_svlogb_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svlogb_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: flogb z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: flogb z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.flogb.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svlogb_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: flogb z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: flogb z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.flogb.nxv2f64( %x, %pg, %y) + ret %0 +} + From b31e9747d0866ff97a1cd4a608b7eade31c0aa0b Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 27 Jan 2025 13:06:33 +0000 Subject: [PATCH 184/432] [lldb][AArch64] Fix expression evaluation with Guarded Control Stacks (#123918) When the Guarded Control Stack (GCS) is enabled, returns cause the processor to validate that the address at the location pointed to by gcspr_el0 matches the one in the link register. ``` ret (lr=A) << pc | GCS | +=====+ | A | | B | << gcspr_el0 Fault: tried to return to A when you should have returned to B. ``` Therefore when an expression wrapper function tries to return to the expression return address (usually `_start` if there is a libc), it would fault. ``` ret (lr=_start) << pc | GCS | +============+ | user_func1 | | user_func2 | << gcspr_el0 Fault: tried to return to _start when you should have returned to user_func2. ``` To fix this we must push that return address to the GCS in PrepareTrivialCall. This value is then consumed by the final return and the expression completes as expected. 
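In outline, the push amounts to the following (a simplified sketch under the ABI described above, not the lldb code itself; `read_reg`, `write_reg` and `write_mem` are hypothetical stand-ins for the debugger's register-context and process APIs):

```c
#include <stdint.h>

// Hypothetical stand-ins for the debugger's register and memory APIs.
extern uint64_t read_reg(const char *name);
extern void write_reg(const char *name, uint64_t value);
extern int write_mem(uint64_t addr, const void *buf, uint64_t len);

// Sketch: push the expression return address onto the Guarded Control
// Stack so that the final ret of the expression wrapper validates.
// A GCS link-register entry is 8 bytes.
void push_expr_return_addr(uint64_t return_addr) {
  uint64_t gcspr = read_reg("gcspr_el0");
  gcspr -= 8;                        // make room for one entry
  write_reg("gcspr_el0", gcspr);     // the "push" moves the pointer down
  write_mem(gcspr, &return_addr, 8); // store the return address
}
```

Restoring gcspr_el0 to its original value after the call has finished then serves as the matching pop.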
If for some reason that fails, we will manually restore the value of gcspr_el0, because it turns out that PrepareTrivialCall does not restore registers if it fails at all. So for now I am handling gcspr_el0 specifically, but I have filed https://github.com/llvm/llvm-project/issues/124269 to address the general problem. (the other things PrepareTrivialCall does are exceedingly likely to not fail, so we have never noticed this) ``` ret (lr=_start) << pc | GCS | +============+ | user_func1 | | user_func2 | | _start | << gcspr_el0 No fault, we return to _start as normal. ``` The gcspr_el0 register will be restored after expression evaluation so that the program can continue correctly. However, due to restrictions in the Linux GCS ABI, we will not restore the enable bit of gcs_features_enabled. Re-enabling GCS via ptrace is not supported because it requires memory to be allocated by the kernel. We could disable GCS if the expression enabled GCS, however this would use up that state transition that the program might later rely on. And generally it is cleaner to ignore the enable bit, rather than one state transition of it. We will also not restore the GCS entry that was overwritten with the expression's return address. On the grounds that: * This entry will never be used by the program. If the program branches, the entry will be overwritten. If the program returns, gcspr_el0 will point to the entry before the expression return address and that entry will instead be validated. * Any expression that calls functions will overwrite even more entries, so the user needs to be aware of that anyway if they want to preserve the contents of the GCS for inspection. * An expression could leave the program in a state where restoring the value makes the situation worse. Especially if we ever support this in bare metal debugging. I will later document all this on https://lldb.llvm.org/use/aarch64-linux.html. Tests have been added for: * A function call that does not interact with GCS. * A call that does, and disables it (we do not re-enable it). * A call that does, and enables it (we do not disable it again). * Failure to push an entry to the GCS stack. --- .../Plugins/ABI/AArch64/ABISysV_arm64.cpp | 75 +++++++ .../NativeRegisterContextLinux_arm64.cpp | 20 +- .../linux/aarch64/gcs/TestAArch64LinuxGCS.py | 211 +++++++++++++++--- lldb/test/API/linux/aarch64/gcs/main.c | 47 +++- 4 files changed, 311 insertions(+), 42 deletions(-) diff --git a/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.cpp b/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.cpp index 93b8141e97ef8..74047ea65788c 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.cpp +++ b/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.cpp @@ -60,6 +60,69 @@ ABISysV_arm64::CreateInstance(lldb::ProcessSP process_sp, const ArchSpec &arch) return ABISP(); } +static Status PushToLinuxGuardedControlStack(addr_t return_addr, + RegisterContext *reg_ctx, + Thread &thread) { + Status err; + + // If the Guarded Control Stack extension is present we may need to put the + // return address onto that stack. + const RegisterInfo *gcs_features_enabled_info = + reg_ctx->GetRegisterInfoByName("gcs_features_enabled"); + if (!gcs_features_enabled_info) + return err; + + uint64_t gcs_features_enabled = reg_ctx->ReadRegisterAsUnsigned( + gcs_features_enabled_info, LLDB_INVALID_ADDRESS); + if (gcs_features_enabled == LLDB_INVALID_ADDRESS) + return Status("Could not read GCS features enabled register."); + + // Only attempt this if GCS is enabled. 
If it's not enabled then gcspr_el0 + // may point to unmapped memory. + if ((gcs_features_enabled & 1) == 0) + return err; + + const RegisterInfo *gcspr_el0_info = + reg_ctx->GetRegisterInfoByName("gcspr_el0"); + if (!gcspr_el0_info) + return Status("Could not get register info for gcspr_el0."); + + uint64_t gcspr_el0 = + reg_ctx->ReadRegisterAsUnsigned(gcspr_el0_info, LLDB_INVALID_ADDRESS); + if (gcspr_el0 == LLDB_INVALID_ADDRESS) + return Status("Could not read gcspr_el0."); + + // A link register entry on the GCS is 8 bytes. + gcspr_el0 -= 8; + if (!reg_ctx->WriteRegisterFromUnsigned(gcspr_el0_info, gcspr_el0)) + return Status( + "Attempted to decrement gcspr_el0, but could not write to it."); + + Status error; + size_t wrote = thread.GetProcess()->WriteMemory(gcspr_el0, &return_addr, + sizeof(return_addr), error); + if ((wrote != sizeof(return_addr) || error.Fail())) { + // When PrepareTrivialCall fails, the register context is not restored, + // unlike when an expression fails to execute. This is arguably a bug, + // see https://github.com/llvm/llvm-project/issues/124269. + // For now we are handling this here specifically. We can assume this + // write will work as the one to decrement the register did. + reg_ctx->WriteRegisterFromUnsigned(gcspr_el0_info, gcspr_el0 + 8); + return Status("Failed to write new Guarded Control Stack entry."); + } + + Log *log = GetLog(LLDBLog::Expressions); + LLDB_LOGF(log, + "Pushed return address 0x%" PRIx64 " to Guarded Control Stack. " + "gcspr_el0 was 0%" PRIx64 ", is now 0x%" PRIx64 ".", + return_addr, gcspr_el0 - 8, gcspr_el0); + + // gcspr_el0 will be restored to the original value by lldb-server after + // the call has finished, which serves as the "pop". + + return err; +} + bool ABISysV_arm64::PrepareTrivialCall(Thread &thread, addr_t sp, addr_t func_addr, addr_t return_addr, llvm::ArrayRef args) const { @@ -87,6 +150,18 @@ bool ABISysV_arm64::PrepareTrivialCall(Thread &thread, addr_t sp, if (args.size() > 8) return false; + // Do this first, as it's got the most chance of failing (though still very + // low). + if (GetProcessSP()->GetTarget().GetArchitecture().GetTriple().isOSLinux()) { + Status err = PushToLinuxGuardedControlStack(return_addr, reg_ctx, thread); + // If we could not manage the GCS, the expression will certainly fail, + // and if we just carried on, that failure would be a lot more cryptic. + if (err.Fail()) { + LLDB_LOGF(log, "Failed to setup Guarded Call Stack: %s", err.AsCString()); + return false; + } + } + for (size_t i = 0; i < args.size(); ++i) { const RegisterInfo *reg_info = reg_ctx->GetRegisterInfo( eRegisterKindGeneric, LLDB_REGNUM_GENERIC_ARG1 + i); diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp index efd3385c46e92..884c7d4b9e359 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp @@ -1063,9 +1063,27 @@ Status NativeRegisterContextLinux_arm64::WriteAllRegisterValues( std::bind(&NativeRegisterContextLinux_arm64::WriteFPMR, this)); break; case RegisterSetType::GCS: + // It is not permitted to enable GCS via ptrace. We can disable it, but + // to keep things simple we will not revert any change to the + // PR_SHADOW_STACK_ENABLE bit. Instead patch in the current enable bit + // into the registers we are about to restore. 
+ m_gcs_is_valid = false; + error = ReadGCS(); + if (error.Fail()) + return error; + + uint64_t enable_bit = m_gcs_regs.features_enabled & 1UL; + gcs_regs new_gcs_regs = *reinterpret_cast(src); + new_gcs_regs.features_enabled = + (new_gcs_regs.features_enabled & ~1UL) | enable_bit; + + const uint8_t *new_gcs_src = + reinterpret_cast(&new_gcs_regs); error = RestoreRegisters( - GetGCSBuffer(), &src, GetGCSBufferSize(), m_gcs_is_valid, + GetGCSBuffer(), &new_gcs_src, GetGCSBufferSize(), m_gcs_is_valid, std::bind(&NativeRegisterContextLinux_arm64::WriteGCS, this)); + src += GetGCSBufferSize(); + break; } diff --git a/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py b/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py index d3d4dbecf4a2a..fd46a42b3c69f 100644 --- a/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py +++ b/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py @@ -3,7 +3,6 @@ extension is enabled. """ - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -84,6 +83,40 @@ def test_gcs_fault(self): ], ) + def check_gcs_registers( + self, + expected_gcs_features_enabled=None, + expected_gcs_features_locked=None, + expected_gcspr_el0=None, + ): + thread = self.dbg.GetSelectedTarget().process.GetThreadAtIndex(0) + registerSets = thread.GetFrameAtIndex(0).GetRegisters() + gcs_registers = registerSets.GetFirstValueByName( + r"Guarded Control Stack Registers" + ) + + gcs_features_enabled = gcs_registers.GetChildMemberWithName( + "gcs_features_enabled" + ).GetValueAsUnsigned() + if expected_gcs_features_enabled is not None: + self.assertEqual(expected_gcs_features_enabled, gcs_features_enabled) + + gcs_features_locked = gcs_registers.GetChildMemberWithName( + "gcs_features_locked" + ).GetValueAsUnsigned() + if expected_gcs_features_locked is not None: + self.assertEqual(expected_gcs_features_locked, gcs_features_locked) + + gcspr_el0 = gcs_registers.GetChildMemberWithName( + "gcspr_el0" + ).GetValueAsUnsigned() + if expected_gcspr_el0 is not None: + self.assertEqual(expected_gcspr_el0, gcspr_el0) + + return gcs_features_enabled, gcs_features_locked, gcspr_el0 + + # This helper reads all the GCS registers and optionally compares them + # against a previous state, then returns the current register values. @skipUnlessArch("aarch64") @skipUnlessPlatform(["linux"]) def test_gcs_registers(self): @@ -108,40 +141,7 @@ def test_gcs_registers(self): self.expect("register read --all", substrs=["Guarded Control Stack Registers:"]) - # This helper reads all the GCS registers and optionally compares them - # against a previous state, then returns the current register values. 
- def check_gcs_registers( - expected_gcs_features_enabled=None, - expected_gcs_features_locked=None, - expected_gcspr_el0=None, - ): - thread = self.dbg.GetSelectedTarget().process.GetThreadAtIndex(0) - registerSets = thread.GetFrameAtIndex(0).GetRegisters() - gcs_registers = registerSets.GetFirstValueByName( - r"Guarded Control Stack Registers" - ) - - gcs_features_enabled = gcs_registers.GetChildMemberWithName( - "gcs_features_enabled" - ).GetValueAsUnsigned() - if expected_gcs_features_enabled is not None: - self.assertEqual(expected_gcs_features_enabled, gcs_features_enabled) - - gcs_features_locked = gcs_registers.GetChildMemberWithName( - "gcs_features_locked" - ).GetValueAsUnsigned() - if expected_gcs_features_locked is not None: - self.assertEqual(expected_gcs_features_locked, gcs_features_locked) - - gcspr_el0 = gcs_registers.GetChildMemberWithName( - "gcspr_el0" - ).GetValueAsUnsigned() - if expected_gcspr_el0 is not None: - self.assertEqual(expected_gcspr_el0, gcspr_el0) - - return gcs_features_enabled, gcs_features_locked, gcspr_el0 - - enabled, locked, spr_el0 = check_gcs_registers() + enabled, locked, spr_el0 = self.check_gcs_registers() # Features enabled should have at least the enable bit set, it could have # others depending on what the C library did, but we can't rely on always @@ -164,7 +164,7 @@ def check_gcs_registers( substrs=["stopped", "stop reason = breakpoint"], ) - _, _, spr_el0 = check_gcs_registers(enabled, locked, spr_el0 - 8) + _, _, spr_el0 = self.check_gcs_registers(enabled, locked, spr_el0 - 8) # Any combination of GCS feature lock bits might have been set by the C # library, and could be set to 0 or 1. To check that we can modify them, @@ -235,3 +235,142 @@ def check_gcs_registers( "exited with status = 0", ], ) + + @skipUnlessPlatform(["linux"]) + def test_gcs_expression_simple(self): + if not self.isAArch64GCS(): + self.skipTest("Target must support GCS.") + + self.build() + self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) + + # Break before GCS has been enabled. + self.runCmd("b main") + # And after it has been enabled. + lldbutil.run_break_set_by_file_and_line( + self, + "main.c", + line_number("main.c", "// Set break point at this line."), + num_expected_locations=1, + ) + + self.runCmd("run", RUN_SUCCEEDED) + + if self.process().GetState() == lldb.eStateExited: + self.fail("Test program failed to run.") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + # GCS has not been enabled yet and the ABI plugin should know not to + # attempt pushing to the control stack. + before = self.check_gcs_registers() + expr_cmd = "p get_gcs_status()" + self.expect(expr_cmd, substrs=["(unsigned long) 0"]) + self.check_gcs_registers(*before) + + # Continue to when GCS has been enabled. + self.runCmd("continue") + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + # If we fail to setup the GCS entry, we should not leave any of the GCS registers + # changed. The last thing we do is write a new GCS entry to memory and + # to simulate the failure of that, temporarily point the GCS to the zero page. + # + # We use the value 8 here because LLDB will decrement it by 8 so it points to + # what we think will be an empty entry on the guarded control stack. 
+ _, _, original_gcspr = self.check_gcs_registers() + self.runCmd("register write gcspr_el0 8") + before = self.check_gcs_registers() + self.expect(expr_cmd, error=True) + self.check_gcs_registers(*before) + # Point to the valid shadow stack region again. + self.runCmd(f"register write gcspr_el0 {original_gcspr}") + + # This time we do need to push to the GCS and having done so, we can + # return from this expression without causing a fault. + before = self.check_gcs_registers() + self.expect(expr_cmd, substrs=["(unsigned long) 1"]) + self.check_gcs_registers(*before) + + @skipUnlessPlatform(["linux"]) + def test_gcs_expression_disable_gcs(self): + if not self.isAArch64GCS(): + self.skipTest("Target must support GCS.") + + self.build() + self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) + + # Break after GCS is enabled. + lldbutil.run_break_set_by_file_and_line( + self, + "main.c", + line_number("main.c", "// Set break point at this line."), + num_expected_locations=1, + ) + + self.runCmd("run", RUN_SUCCEEDED) + + if self.process().GetState() == lldb.eStateExited: + self.fail("Test program failed to run.") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + # Unlock all features so the expression can enable them again. + self.runCmd("register write gcs_features_locked 0") + # Disable all features, but keep GCS itself enabled. + PR_SHADOW_STACK_ENABLE = 1 + self.runCmd(f"register write gcs_features_enabled 0x{PR_SHADOW_STACK_ENABLE:x}") + + enabled, locked, spr_el0 = self.check_gcs_registers() + # We restore everything apart GCS being enabled, as we are not allowed to + # go from disabled -> enabled via ptrace. + self.expect("p change_gcs_config(false)", substrs=["true"]) + enabled &= ~1 + self.check_gcs_registers(enabled, locked, spr_el0) + + @skipUnlessPlatform(["linux"]) + def test_gcs_expression_enable_gcs(self): + if not self.isAArch64GCS(): + self.skipTest("Target must support GCS.") + + self.build() + self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) + + # Break before GCS is enabled. + self.runCmd("b main") + + self.runCmd("run", RUN_SUCCEEDED) + + if self.process().GetState() == lldb.eStateExited: + self.fail("Test program failed to run.") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + # Unlock all features so the expression can enable them again. + self.runCmd("register write gcs_features_locked 0") + # Disable all features. The program needs PR_SHADOW_STACK_PUSH, but it + # will enable that itself. + self.runCmd(f"register write gcs_features_enabled 0") + + enabled, locked, spr_el0 = self.check_gcs_registers() + self.expect("p change_gcs_config(true)", substrs=["true"]) + # Though we could disable GCS with ptrace, we choose not to to be + # consistent with the disabled -> enabled behaviour. 
+ enabled |= 1 + self.check_gcs_registers(enabled, locked, spr_el0) diff --git a/lldb/test/API/linux/aarch64/gcs/main.c b/lldb/test/API/linux/aarch64/gcs/main.c index 09354639af376..396aef7499ca9 100644 --- a/lldb/test/API/linux/aarch64/gcs/main.c +++ b/lldb/test/API/linux/aarch64/gcs/main.c @@ -1,4 +1,5 @@ #include +#include #include #include @@ -8,7 +9,12 @@ #define PR_GET_SHADOW_STACK_STATUS 74 #define PR_SET_SHADOW_STACK_STATUS 75 -#define PR_SHADOW_STACK_ENABLE (1UL) +#define PR_LOCK_SHADOW_STACK_STATUS 76 + +#define PR_SHADOW_STACK_ENABLE (1UL << 0) +#define PR_SHADOW_STACK_WRITE (1UL << 1) +#define PR_SHADOW_STACK_PUSH (1UL << 2) + #define PRCTL_SYSCALL_NO 167 // Once we enable GCS, we cannot return from the function that made the syscall @@ -36,6 +42,36 @@ unsigned long get_gcs_status() { return mode; } +extern void _start(); +bool change_gcs_config(bool enable) { + // The test unlocks and disables all features (excluding the main enable bit) + // before calling this expression. Enable them again. + unsigned long new_status = + enable | PR_SHADOW_STACK_PUSH | PR_SHADOW_STACK_WRITE; + + if (enable) { + // We would not be able to return from prctl(). + my_prctl(PR_SET_SHADOW_STACK_STATUS, new_status, 0, 0, 0); + + // This is a stack, so we must push in reverse order to the pops we want to + // have later. So push the return of __lldb_expr (_start), then the return + // address of this function (__lldb_expr). + __asm__ __volatile__("sys #3, C7, C7, #0, %0\n" // gcspushm _start + "sys #3, C7, C7, #0, x30\n" // gcspushm x30 + : + : "r"(_start)); + } else { + if (prctl(PR_SET_SHADOW_STACK_STATUS, new_status, 0, 0, 0) != 0) + return false; + } + + // Turn back on all locks. + if (prctl(PR_LOCK_SHADOW_STACK_STATUS, ~(0UL), 0, 0, 0) != 0) + return false; + + return true; +} + void gcs_signal() { // If we enabled GCS manually, then we could just return from main to generate // a signal. However, if the C library enabled it, then we'd just exit @@ -50,10 +86,11 @@ void gcs_signal() { } // These functions are used to observe gcspr_el0 changing as we enter them, and -// the fault we cause by changing its value. -void test_func2() { volatile int i = 99; } +// the fault we cause by changing its value. Also used to check expression +// eval can handle function calls. +int test_func2() { return 99; } -void test_func() { test_func2(); } +int test_func() { return test_func2(); } int main() { if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) @@ -71,7 +108,7 @@ int main() { // By now we should have one memory region where the GCS is stored. // For register read/write tests. - test_func(); + volatile int i = test_func(); // If this was a register test, we would have disabled GCS during the // test_func call. We cannot re-enable it from ptrace so skip this part in From ef54e0bbfbef59932a59a1640f1f9e14b70cc41b Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 27 Jan 2025 13:12:11 +0000 Subject: [PATCH 185/432] [AArch64] Avoid generating LDAPUR on certain cores (#124274) On the CPUs listed below, we want to avoid LDAPUR for performance reasons. 
Add a tuning feature to disable them when using: -mcpu=neoverse-v2 -mcpu=neoverse-v3 -mcpu=cortex-x3 -mcpu=cortex-x4 -mcpu=cortex-x925 --- llvm/lib/Target/AArch64/AArch64Features.td | 6 +- .../lib/Target/AArch64/AArch64InstrAtomics.td | 4 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2 + llvm/lib/Target/AArch64/AArch64Processors.td | 6 + .../Atomics/aarch64-atomic-load-rcpc_immo.ll | 144 ++++++++++++++---- 5 files changed, 127 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 0a91edb4c1661..20db70ee38572 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -805,10 +805,14 @@ def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedO def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly", "true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">; -def FeatureUseFixedOverScalableIfEqualCost: SubtargetFeature<"use-fixed-over-scalable-if-equal-cost", +def FeatureUseFixedOverScalableIfEqualCost : SubtargetFeature<"use-fixed-over-scalable-if-equal-cost", "UseFixedOverScalableIfEqualCost", "true", "Prefer fixed width loop vectorization over scalable if the cost-model assigns equal costs">; +// For performance reasons we prefer to use ldapr to ldapur on certain cores. +def FeatureAvoidLDAPUR : SubtargetFeature<"avoid-ldapur", "AvoidLDAPUR", "true", + "Prefer add+ldapr to offset ldapur">; + //===----------------------------------------------------------------------===// // Architectures. // diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index de94cf64c9801..5e6db9d007a55 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -575,7 +575,7 @@ let Predicates = [HasRCPC3, HasNEON] in { } // v8.4a FEAT_LRCPC2 patterns -let Predicates = [HasRCPC_IMMO] in { +let Predicates = [HasRCPC_IMMO, UseLDAPUR] in { // Load-Acquire RCpc Register unscaled loads def : Pat<(acquiring_load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), @@ -589,7 +589,9 @@ let Predicates = [HasRCPC_IMMO] in { def : Pat<(acquiring_load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (LDAPURXi GPR64sp:$Rn, simm9:$offset)>; +} +let Predicates = [HasRCPC_IMMO] in { // Store-Release Register unscaled stores def : Pat<(releasing_store (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val), diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index fa6385409f30c..9d0bd44544134 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -389,6 +389,8 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">; def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">; +def UseLDAPUR : Predicate<"!Subtarget->avoidLDAPUR()">; + def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisInt<1>]>>; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 0e3c4e8397f52..8a2c0442a0c0d 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -240,6 +240,7 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3", FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureUseFixedOverScalableIfEqualCost, + FeatureAvoidLDAPUR, 
FeaturePredictableSelectIsExpensive]>; def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4", @@ -250,6 +251,7 @@ def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4", FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureUseFixedOverScalableIfEqualCost, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily", @@ -260,6 +262,7 @@ def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily", FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureUseFixedOverScalableIfEqualCost, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", @@ -540,6 +543,7 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2 FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureUseFixedOverScalableIfEqualCost, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3", @@ -549,6 +553,7 @@ def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3 FeatureFuseAdrpAdd, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "NeoverseV3", @@ -558,6 +563,7 @@ def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "Neover FeatureFuseAdrpAdd, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll index 9687ba683fb7e..b475e68db411a 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "(?!^\s*lda.*\bsp\b)^\s*.*\bsp\b" --filter "^\s*(ld|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)" ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=true -global-isel-abort=2 -O0 | FileCheck %s --check-prefixes=CHECK,GISEL -; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-NOAVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo,avoid-ldapur -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=neoverse-v2 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=neoverse-v3 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x3 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x4 -global-isel=false -O1 | 
FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x925 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR define i8 @load_atomic_i8_aligned_unordered(ptr %ptr) { ; CHECK-LABEL: load_atomic_i8_aligned_unordered: @@ -39,8 +45,12 @@ define i8 @load_atomic_i8_aligned_acquire(ptr %ptr) { ; GISEL: add x8, x0, #4 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_aligned_acquire: -; SDAG: ldapurb w0, [x0, #4] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [x0, #4] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #4 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -51,8 +61,12 @@ define i8 @load_atomic_i8_aligned_acquire_const(ptr readonly %ptr) { ; GISEL: add x8, x0, #4 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_aligned_acquire_const: -; SDAG: ldapurb w0, [x0, #4] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [x0, #4] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #4 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -113,8 +127,12 @@ define i16 @load_atomic_i16_aligned_acquire(ptr %ptr) { ; GISEL: add x8, x0, #8 ; GISEL: ldaprh w0, [x8] ; -; SDAG-LABEL: load_atomic_i16_aligned_acquire: -; SDAG: ldapurh w0, [x0, #8] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapurh w0, [x0, #8] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #8 +; SDAG-AVOIDLDAPUR: ldaprh w0, [x8] %gep = getelementptr inbounds i16, ptr %ptr, i32 4 %r = load atomic i16, ptr %gep acquire, align 2 ret i16 %r @@ -125,8 +143,12 @@ define i16 @load_atomic_i16_aligned_acquire_const(ptr readonly %ptr) { ; GISEL: add x8, x0, #8 ; GISEL: ldaprh w0, [x8] ; -; SDAG-LABEL: load_atomic_i16_aligned_acquire_const: -; SDAG: ldapurh w0, [x0, #8] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapurh w0, [x0, #8] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #8 +; SDAG-AVOIDLDAPUR: ldaprh w0, [x8] %gep = getelementptr inbounds i16, ptr %ptr, i32 4 %r = load atomic i16, ptr %gep acquire, align 2 ret i16 %r @@ -183,16 +205,30 @@ define i32 @load_atomic_i32_aligned_monotonic_const(ptr readonly %ptr) { } define i32 @load_atomic_i32_aligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_i32_aligned_acquire: -; CHECK: ldapur w0, [x0, #16] +; GISEL-LABEL: load_atomic_i32_aligned_acquire: +; GISEL: ldapur w0, [x0, #16] +; +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapur w0, [x0, #16] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #16 +; SDAG-AVOIDLDAPUR: ldapr w0, [x8] %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %r = load atomic i32, ptr %gep acquire, align 4 ret i32 %r } define i32 @load_atomic_i32_aligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_i32_aligned_acquire_const: -; CHECK: ldapur w0, [x0, #16] +; GISEL-LABEL: load_atomic_i32_aligned_acquire_const: +; GISEL: ldapur w0, [x0, #16] +; +; SDAG-NOAVOIDLDAPUR-LABEL: 
load_atomic_i32_aligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapur w0, [x0, #16] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #16 +; SDAG-AVOIDLDAPUR: ldapr w0, [x8] %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %r = load atomic i32, ptr %gep acquire, align 4 ret i32 %r @@ -249,16 +285,30 @@ define i64 @load_atomic_i64_aligned_monotonic_const(ptr readonly %ptr) { } define i64 @load_atomic_i64_aligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_i64_aligned_acquire: -; CHECK: ldapur x0, [x0, #32] +; GISEL-LABEL: load_atomic_i64_aligned_acquire: +; GISEL: ldapur x0, [x0, #32] +; +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapur x0, [x0, #32] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #32 +; SDAG-AVOIDLDAPUR: ldapr x0, [x8] %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %r = load atomic i64, ptr %gep acquire, align 8 ret i64 %r } define i64 @load_atomic_i64_aligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_i64_aligned_acquire_const: -; CHECK: ldapur x0, [x0, #32] +; GISEL-LABEL: load_atomic_i64_aligned_acquire_const: +; GISEL: ldapur x0, [x0, #32] +; +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapur x0, [x0, #32] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #32 +; SDAG-AVOIDLDAPUR: ldapr x0, [x8] %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %r = load atomic i64, ptr %gep acquire, align 8 ret i64 %r @@ -387,8 +437,12 @@ define i8 @load_atomic_i8_unaligned_acquire(ptr %ptr) { ; GISEL: add x8, x0, #4 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_unaligned_acquire: -; SDAG: ldapurb w0, [x0, #4] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [x0, #4] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #4 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -399,8 +453,12 @@ define i8 @load_atomic_i8_unaligned_acquire_const(ptr readonly %ptr) { ; GISEL: add x8, x0, #4 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_unaligned_acquire_const: -; SDAG: ldapurb w0, [x0, #4] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [x0, #4] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #4 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -846,9 +904,14 @@ define i8 @load_atomic_i8_from_gep() { ; GISEL: add x8, x8, #1 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_from_gep: -; SDAG: bl init -; SDAG: ldapurb w0, [sp, #13] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_from_gep: +; SDAG-NOAVOIDLDAPUR: bl init +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [sp, #13] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_from_gep: +; SDAG-AVOIDLDAPUR: bl init +; SDAG-AVOIDLDAPUR: orr x8, x19, #0x1 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %a = alloca [3 x i8] call void @init(ptr %a) %arrayidx = getelementptr [3 x i8], ptr %a, i64 0, i64 1 @@ -862,9 +925,14 @@ define i16 @load_atomic_i16_from_gep() { ; GISEL: add x8, x8, #2 ; GISEL: ldaprh w0, [x8] ; -; SDAG-LABEL: load_atomic_i16_from_gep: -; SDAG: bl init -; SDAG: ldapurh w0, [sp, 
#10] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i16_from_gep: +; SDAG-NOAVOIDLDAPUR: bl init +; SDAG-NOAVOIDLDAPUR: ldapurh w0, [sp, #10] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i16_from_gep: +; SDAG-AVOIDLDAPUR: bl init +; SDAG-AVOIDLDAPUR: orr x8, x19, #0x2 +; SDAG-AVOIDLDAPUR: ldaprh w0, [x8] %a = alloca [3 x i16] call void @init(ptr %a) %arrayidx = getelementptr [3 x i16], ptr %a, i64 0, i64 1 @@ -877,9 +945,14 @@ define i32 @load_atomic_i32_from_gep() { ; GISEL: bl init ; GISEL: ldapur w0, [x8, #4] ; -; SDAG-LABEL: load_atomic_i32_from_gep: -; SDAG: bl init -; SDAG: ldapur w0, [sp, #8] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i32_from_gep: +; SDAG-NOAVOIDLDAPUR: bl init +; SDAG-NOAVOIDLDAPUR: ldapur w0, [sp, #8] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i32_from_gep: +; SDAG-AVOIDLDAPUR: bl init +; SDAG-AVOIDLDAPUR: add x8, x19, #4 +; SDAG-AVOIDLDAPUR: ldapr w0, [x8] %a = alloca [3 x i32] call void @init(ptr %a) %arrayidx = getelementptr [3 x i32], ptr %a, i64 0, i64 1 @@ -892,9 +965,14 @@ define i64 @load_atomic_i64_from_gep() { ; GISEL: bl init ; GISEL: ldapur x0, [x8, #8] ; -; SDAG-LABEL: load_atomic_i64_from_gep: -; SDAG: bl init -; SDAG: ldapur x0, [sp, #16] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i64_from_gep: +; SDAG-NOAVOIDLDAPUR: bl init +; SDAG-NOAVOIDLDAPUR: ldapur x0, [sp, #16] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i64_from_gep: +; SDAG-AVOIDLDAPUR: bl init +; SDAG-AVOIDLDAPUR: add x8, x19, #8 +; SDAG-AVOIDLDAPUR: ldapr x0, [x8] %a = alloca [3 x i64] call void @init(ptr %a) %arrayidx = getelementptr [3 x i64], ptr %a, i64 0, i64 1 From 347fb208c1e390a4f108e566efc81bd945837307 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 27 Jan 2025 13:25:37 +0000 Subject: [PATCH 186/432] [libclc] Optimize CLC vector relational builtins (#124537) Clang knows how to perform relational operations on OpenCL vectors, so we don't need to use the Clang builtins. The builtins we were using didn't support vector types, so we were previously scalarizing. This commit generates the same LLVM fcmp operations as before, just without the scalarization. 
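As a rough sketch of the mechanism (an abbreviated, illustrative expansion:
the real _CLC_DEFINE_SIMPLE_RELATIONAL_BINARY macro in relational.h below
emits all six overloads), defining

    #define _CLC_RELATIONAL_OP(X, Y) (X) == (Y)
    _CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isequal, double, double)

expands to overloads along the lines of

    _CLC_DEF _CLC_OVERLOAD int __clc_isequal(double x, double y) {
      return (x) == (y);
    }
    _CLC_DEF _CLC_OVERLOAD long2 __clc_isequal(double2 x, double2 y) {
      /* OpenCL vector == compares element-wise and yields -1/0 per lane,
         which Clang lowers to the same fcmp as the old scalarized code. */
      return (x) == (y);
    }

and likewise for the 3-, 4-, 8- and 16-wide vector types.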
--- .../clc/include/clc/relational/relational.h | 26 +++++++++++++ .../clc/lib/generic/relational/clc_isequal.cl | 38 ++++++------------- .../lib/generic/relational/clc_isgreater.cl | 23 +++-------- .../generic/relational/clc_isgreaterequal.cl | 30 ++++++--------- .../clc/lib/generic/relational/clc_isless.cl | 31 ++++++--------- .../lib/generic/relational/clc_islessequal.cl | 24 ++++-------- .../generic/relational/clc_islessgreater.cl | 27 +++++-------- .../lib/generic/relational/clc_isnotequal.cl | 21 ++++------ 8 files changed, 90 insertions(+), 130 deletions(-) diff --git a/libclc/clc/include/clc/relational/relational.h b/libclc/clc/include/clc/relational/relational.h index 54241b6493c8e..f32e7630203e4 100644 --- a/libclc/clc/include/clc/relational/relational.h +++ b/libclc/clc/include/clc/relational/relational.h @@ -142,4 +142,30 @@ _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE, \ ARG1_TYPE) +#define _CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(RET_TYPE, RET_TYPE_VEC, FUNCTION, \ + ARG1_TYPE, ARG2_TYPE) \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ + return _CLC_RELATIONAL_OP(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##2 FUNCTION(ARG1_TYPE##2 x, \ + ARG2_TYPE##2 y) { \ + return _CLC_RELATIONAL_OP(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##3 FUNCTION(ARG1_TYPE##3 x, \ + ARG2_TYPE##3 y) { \ + return _CLC_RELATIONAL_OP(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##4 FUNCTION(ARG1_TYPE##4 x, \ + ARG2_TYPE##4 y) { \ + return _CLC_RELATIONAL_OP(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##8 FUNCTION(ARG1_TYPE##8 x, \ + ARG2_TYPE##8 y) { \ + return _CLC_RELATIONAL_OP(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##16 FUNCTION(ARG1_TYPE##16 x, \ + ARG2_TYPE##16 y) { \ + return _CLC_RELATIONAL_OP(x, y); \ + } + #endif // __CLC_RELATIONAL_RELATIONAL_H__ diff --git a/libclc/clc/lib/generic/relational/clc_isequal.cl b/libclc/clc/lib/generic/relational/clc_isequal.cl index 7664df7767cb3..053a237289fd6 100644 --- a/libclc/clc/lib/generic/relational/clc_isequal.cl +++ b/libclc/clc/lib/generic/relational/clc_isequal.cl @@ -1,44 +1,28 @@ #include +#include -#define _CLC_DEFINE_ISEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ - return (x == y); \ - } +#define _CLC_RELATIONAL_OP(X, Y) (X) == (Y) -_CLC_DEFINE_ISEQUAL(int, __clc_isequal, float, float) -_CLC_DEFINE_ISEQUAL(int2, __clc_isequal, float2, float2) -_CLC_DEFINE_ISEQUAL(int3, __clc_isequal, float3, float3) -_CLC_DEFINE_ISEQUAL(int4, __clc_isequal, float4, float4) -_CLC_DEFINE_ISEQUAL(int8, __clc_isequal, float8, float8) -_CLC_DEFINE_ISEQUAL(int16, __clc_isequal, float16, float16) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isequal, float, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -// The scalar version of __clc_isequal(double) returns an int, but the vector -// versions return long. -_CLC_DEFINE_ISEQUAL(int, __clc_isequal, double, double) -_CLC_DEFINE_ISEQUAL(long2, __clc_isequal, double2, double2) -_CLC_DEFINE_ISEQUAL(long3, __clc_isequal, double3, double3) -_CLC_DEFINE_ISEQUAL(long4, __clc_isequal, double4, double4) -_CLC_DEFINE_ISEQUAL(long8, __clc_isequal, double8, double8) -_CLC_DEFINE_ISEQUAL(long16, __clc_isequal, double16, double16) +// The scalar version of __clc_isequal(double, double) returns an int, but the +// vector versions return long. 
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isequal, double, double) #endif + #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable -// The scalar version of __clc_isequal(half) returns an int, but the vector -// versions return short. -_CLC_DEFINE_ISEQUAL(int, __clc_isequal, half, half) -_CLC_DEFINE_ISEQUAL(short2, __clc_isequal, half2, half2) -_CLC_DEFINE_ISEQUAL(short3, __clc_isequal, half3, half3) -_CLC_DEFINE_ISEQUAL(short4, __clc_isequal, half4, half4) -_CLC_DEFINE_ISEQUAL(short8, __clc_isequal, half8, half8) -_CLC_DEFINE_ISEQUAL(short16, __clc_isequal, half16, half16) +// The scalar version of __clc_isequal(half, half) returns an int, but the +// vector versions return short. +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isequal, half, half) #endif -#undef _CLC_DEFINE_ISEQUAL +#undef _CLC_RELATIONAL_OP diff --git a/libclc/clc/lib/generic/relational/clc_isgreater.cl b/libclc/clc/lib/generic/relational/clc_isgreater.cl index 39fb6b07fb185..ec14fa9a2ec08 100644 --- a/libclc/clc/lib/generic/relational/clc_isgreater.cl +++ b/libclc/clc/lib/generic/relational/clc_isgreater.cl @@ -1,12 +1,9 @@ #include #include -// Note: It would be nice to use __builtin_isgreater with vector inputs, but it -// seems to only take scalar values as input, which will produce incorrect -// output for vector input types. +#define _CLC_RELATIONAL_OP(X, Y) (X) > (Y) -_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreater, __builtin_isgreater, float, - float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isgreater, float, float) #ifdef cl_khr_fp64 @@ -14,12 +11,7 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreater, __builtin_isgreater, float, // The scalar version of __clc_isgreater(double, double) returns an int, but the // vector versions return long. - -_CLC_DEF _CLC_OVERLOAD int __clc_isgreater(double x, double y) { - return __builtin_isgreater(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreater, double, double) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isgreater, double, double) #endif @@ -29,11 +21,8 @@ _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreater, double, double) // The scalar version of __clc_isgreater(half, half) returns an int, but the // vector versions return short. - -_CLC_DEF _CLC_OVERLOAD int __clc_isgreater(half x, half y) { - return __builtin_isgreater(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isgreater, half, half) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isgreater, half, half) #endif + +#undef _CLC_RELATIONAL_OP diff --git a/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl b/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl index ccf7c881a5549..e96f2325cbad4 100644 --- a/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl +++ b/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl @@ -1,12 +1,10 @@ #include #include -// Note: It would be nice to use __builtin_isgreaterequal with vector inputs, -// but it seems to only take scalar values as input, which will produce -// incorrect output for vector input types. +#define _CLC_RELATIONAL_OP(X, Y) (X) >= (Y) -_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreaterequal, - __builtin_isgreaterequal, float, float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isgreaterequal, float, + float) #ifdef cl_khr_fp64 @@ -14,26 +12,20 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreaterequal, // The scalar version of __clc_isgreaterequal(double, double) returns an int, // but the vector versions return long. 
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isgreaterequal(double x, double y) {
-  return __builtin_isgreaterequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreaterequal, double,
-                                      double)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isgreaterequal, double,
+                                     double)
 
 #endif
+
 #ifdef cl_khr_fp16
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 // The scalar version of __clc_isgreaterequal(half, half) returns an int, but
 // the vector versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isgreaterequal(half x, half y) {
-  return __builtin_isgreaterequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isgreaterequal, half, half)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isgreaterequal, half,
+                                     half)
 
 #endif
+
+#undef _CLC_RELATIONAL_OP
diff --git a/libclc/clc/lib/generic/relational/clc_isless.cl b/libclc/clc/lib/generic/relational/clc_isless.cl
index 1204a5057d864..0ce001d31d696 100644
--- a/libclc/clc/lib/generic/relational/clc_isless.cl
+++ b/libclc/clc/lib/generic/relational/clc_isless.cl
@@ -1,37 +1,28 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>
 
-// Note: It would be nice to use __builtin_isless with vector inputs, but it
-// seems to only take scalar values as input, which will produce incorrect
-// output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) (X) < (Y)
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isless, __builtin_isless, float, float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isless, float, float)
 
 #ifdef cl_khr_fp64
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
-// The scalar version of __clc_isless(double, double) returns an int, but the
-// vector versions return long.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isless(double x, double y) {
-  return __builtin_isless(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isless, double, double)
+// The scalar version of __clc_isless(double, double) returns an int, but
+// the vector versions return long.
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isless, double, double)
 
 #endif
+
 #ifdef cl_khr_fp16
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-// The scalar version of __clc_isless(half, half) returns an int, but the vector
-// versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isless(half x, half y) {
-  return __builtin_isless(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isless, half, half)
+// The scalar version of __clc_isless(half, half) returns an int, but the
+// vector versions return short.
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isless, half, half)
 
 #endif
+
+#undef _CLC_RELATIONAL_OP
diff --git a/libclc/clc/lib/generic/relational/clc_islessequal.cl b/libclc/clc/lib/generic/relational/clc_islessequal.cl
index 6fde763263e2b..2d1d6d199fdab 100644
--- a/libclc/clc/lib/generic/relational/clc_islessequal.cl
+++ b/libclc/clc/lib/generic/relational/clc_islessequal.cl
@@ -1,12 +1,9 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>
 
-// Note: It would be nice to use __builtin_islessequal with vector inputs, but
-// it seems to only take scalar values as input, which will produce incorrect
-// output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) (X) <= (Y) -_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessequal, __builtin_islessequal, - float, float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_islessequal, float, float) #ifdef cl_khr_fp64 @@ -14,12 +11,8 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessequal, __builtin_islessequal, // The scalar version of __clc_islessequal(double, double) returns an int, but // the vector versions return long. - -_CLC_DEF _CLC_OVERLOAD int __clc_islessequal(double x, double y) { - return __builtin_islessequal(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessequal, double, double) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_islessequal, double, + double) #endif @@ -29,11 +22,8 @@ _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessequal, double, double) // The scalar version of __clc_islessequal(half, half) returns an int, but the // vector versions return short. - -_CLC_DEF _CLC_OVERLOAD int __clc_islessequal(half x, half y) { - return __builtin_islessequal(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_islessequal, half, half) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_islessequal, half, half) #endif + +#undef _CLC_RELATIONAL_OP diff --git a/libclc/clc/lib/generic/relational/clc_islessgreater.cl b/libclc/clc/lib/generic/relational/clc_islessgreater.cl index 5106c9f460e2c..3ca3c37731d15 100644 --- a/libclc/clc/lib/generic/relational/clc_islessgreater.cl +++ b/libclc/clc/lib/generic/relational/clc_islessgreater.cl @@ -1,12 +1,10 @@ #include #include -// Note: It would be nice to use __builtin_islessgreater with vector inputs, but -// it seems to only take scalar values as input, which will produce incorrect -// output for vector input types. +#define _CLC_RELATIONAL_OP(X, Y) ((X) < (Y)) || ((X) > (Y)) -_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessgreater, __builtin_islessgreater, - float, float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_islessgreater, float, + float) #ifdef cl_khr_fp64 @@ -14,25 +12,20 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessgreater, __builtin_islessgreater, // The scalar version of __clc_islessgreater(double, double) returns an int, but // the vector versions return long. - -_CLC_DEF _CLC_OVERLOAD int __clc_islessgreater(double x, double y) { - return __builtin_islessgreater(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessgreater, double, double) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_islessgreater, double, + double) #endif + #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable // The scalar version of __clc_islessgreater(half, half) returns an int, but the // vector versions return short. 
- -_CLC_DEF _CLC_OVERLOAD int __clc_islessgreater(half x, half y) { - return __builtin_islessgreater(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_islessgreater, half, half) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_islessgreater, half, + half) #endif + +#undef _CLC_RELATIONAL_OP diff --git a/libclc/clc/lib/generic/relational/clc_isnotequal.cl b/libclc/clc/lib/generic/relational/clc_isnotequal.cl index 9f90713b2da50..d1ee4deab25c8 100644 --- a/libclc/clc/lib/generic/relational/clc_isnotequal.cl +++ b/libclc/clc/lib/generic/relational/clc_isnotequal.cl @@ -1,33 +1,28 @@ #include #include -#define _CLC_DEFINE_ISNOTEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ - return (x != y); \ - } +#define _CLC_RELATIONAL_OP(X, Y) (X) != (Y) -_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, float, float) -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, __clc_isnotequal, float, float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isnotequal, float, float) #ifdef cl_khr_fp64 + #pragma OPENCL EXTENSION cl_khr_fp64 : enable // The scalar version of __clc_isnotequal(double, double) returns an int, but // the vector versions return long. - -_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, double, double) -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isnotequal, double, double) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isnotequal, double, double) #endif + #ifdef cl_khr_fp16 + #pragma OPENCL EXTENSION cl_khr_fp16 : enable // The scalar version of __clc_isnotequal(half, half) returns an int, but the // vector versions return short. - -_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, half, half) -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isnotequal, half, half) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isnotequal, half, half) #endif -#undef _CLC_DEFINE_ISNOTEQUAL +#undef _CLC_RELATIONAL_OP From e9e06bea8661ddd474557a0db2cdc8770a55b66f Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 27 Jan 2025 13:27:31 +0000 Subject: [PATCH 187/432] [lldb][AArch64][NFC] Move a comment in GCS tests Got put in the wrong place during a rebase. --- lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py b/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py index fd46a42b3c69f..797551b061a23 100644 --- a/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py +++ b/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py @@ -83,6 +83,8 @@ def test_gcs_fault(self): ], ) + # This helper reads all the GCS registers and optionally compares them + # against a previous state, then returns the current register values. def check_gcs_registers( self, expected_gcs_features_enabled=None, @@ -115,8 +117,6 @@ def check_gcs_registers( return gcs_features_enabled, gcs_features_locked, gcspr_el0 - # This helper reads all the GCS registers and optionally compares them - # against a previous state, then returns the current register values. @skipUnlessArch("aarch64") @skipUnlessPlatform(["linux"]) def test_gcs_registers(self): From d7e561b913d2a75c7c1807bf1c1e0bddc270a2b3 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Mon, 27 Jan 2025 15:02:38 +0100 Subject: [PATCH 188/432] [flang][OpenMP] Support `bind` clause code-gen for standalone `loop`s (#122674) Extends rewriting of `loop` directives by supporting `bind` clause for standalone directives. 
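For example (a sketch mirroring the new tests added below; the variable names
and bounds are illustrative), a standalone construct such as

    !$omp loop bind(parallel) private(a)
    do i = 1, n
      c(i) = a(i) * b(i)
    end do

is now rewritten to an omp.wsloop wrapper instead of hitting the todo("bind")
path.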
This follows both the spec and the current state of clang as follows:
* No `bind` or `bind(thread)`: the `loop` is rewritten to `simd`.
* `bind(parallel)`: the `loop` is rewritten to `do`.
* `bind(teams)`: the `loop` is rewritten to `distribute`.

This is a follow-up PR for
https://github.com/llvm/llvm-project/pull/122632; only the latest commit
in this PR is relevant here.
---
 .../OpenMP/GenericLoopConversion.cpp          | 86 +++++++++++++++----
 flang/test/Lower/OpenMP/loop-directive.f90    | 42 ++++++++-
 2 files changed, 109 insertions(+), 19 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
index 555601c5e92df..c95d625d7240b 100644
--- a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
@@ -53,7 +53,7 @@ class GenericLoopConversionPattern
     switch (combinedInfo) {
     case GenericLoopCombinedInfo::Standalone:
-      rewriteToSimdLoop(loopOp, rewriter);
+      rewriteStandaloneLoop(loopOp, rewriter);
       break;
     case GenericLoopCombinedInfo::TargetParallelLoop:
       llvm_unreachable("not yet implemented: `parallel loop` directive");
@@ -87,7 +87,10 @@ class GenericLoopConversionPattern
              << loopOp->getName() << " operation";
     };
 
-    if (loopOp.getBindKind())
+    // For standalone directives, `bind` is already supported. Other combined
+    // forms will be supported in a follow-up PR.
+    if (combinedInfo != GenericLoopCombinedInfo::Standalone &&
+        loopOp.getBindKind())
       return todo("bind");
 
     if (loopOp.getOrder())
@@ -119,7 +122,27 @@ class GenericLoopConversionPattern
     return result;
   }
 
-  /// Rewrites standalone `loop` directives to equivalent `simd` constructs.
+  void rewriteStandaloneLoop(mlir::omp::LoopOp loopOp,
+                             mlir::ConversionPatternRewriter &rewriter) const {
+    using namespace mlir::omp;
+    std::optional<ClauseBindKind> bindKind = loopOp.getBindKind();
+
+    if (!bindKind.has_value())
+      return rewriteToSimdLoop(loopOp, rewriter);
+
+    switch (*loopOp.getBindKind()) {
+    case ClauseBindKind::Parallel:
+      return rewriteToWsloop(loopOp, rewriter);
+    case ClauseBindKind::Teams:
+      return rewriteToDistrbute(loopOp, rewriter);
+    case ClauseBindKind::Thread:
+      return rewriteToSimdLoop(loopOp, rewriter);
+    }
+  }
+
+  /// Rewrites standalone `loop` (without `bind` clause or with
+  /// `bind(thread)`) directives to equivalent `simd` constructs.
+  ///
   /// The reasoning behind this decision is that according to the spec (version
   /// 5.2, section 11.7.1):
   ///
@@ -147,30 +170,57 @@ class GenericLoopConversionPattern
   /// the directive.
void rewriteToSimdLoop(mlir::omp::LoopOp loopOp, mlir::ConversionPatternRewriter &rewriter) const { - loopOp.emitWarning("Detected standalone OpenMP `loop` directive, the " - "associated loop will be rewritten to `simd`."); - mlir::omp::SimdOperands simdClauseOps; - simdClauseOps.privateVars = loopOp.getPrivateVars(); + loopOp.emitWarning( + "Detected standalone OpenMP `loop` directive with thread binding, " + "the associated loop will be rewritten to `simd`."); + rewriteToSingleWrapperOp( + loopOp, rewriter); + } + + void rewriteToDistrbute(mlir::omp::LoopOp loopOp, + mlir::ConversionPatternRewriter &rewriter) const { + rewriteToSingleWrapperOp(loopOp, rewriter); + } + + void rewriteToWsloop(mlir::omp::LoopOp loopOp, + mlir::ConversionPatternRewriter &rewriter) const { + rewriteToSingleWrapperOp( + loopOp, rewriter); + } + + // TODO Suggestion by Sergio: tag auto-generated operations for constructs + // that weren't part of the original program, that would be useful + // information for debugging purposes later on. This new attribute could be + // used for `omp.loop`, but also for `do concurrent` transformations, + // `workshare`, `workdistribute`, etc. The tag could be used for all kinds of + // auto-generated operations using a dialect attribute (named something like + // `omp.origin` or `omp.derived`) and perhaps hold the name of the operation + // it was derived from, the reason it was transformed or something like that + // we could use when emitting any messages related to it later on. + template + void + rewriteToSingleWrapperOp(mlir::omp::LoopOp loopOp, + mlir::ConversionPatternRewriter &rewriter) const { + OpOperandsTy clauseOps; + clauseOps.privateVars = loopOp.getPrivateVars(); auto privateSyms = loopOp.getPrivateSyms(); if (privateSyms) - simdClauseOps.privateSyms.assign(privateSyms->begin(), - privateSyms->end()); + clauseOps.privateSyms.assign(privateSyms->begin(), privateSyms->end()); - Fortran::common::openmp::EntryBlockArgs simdArgs; - simdArgs.priv.vars = simdClauseOps.privateVars; + Fortran::common::openmp::EntryBlockArgs args; + args.priv.vars = clauseOps.privateVars; - auto simdOp = - rewriter.create(loopOp.getLoc(), simdClauseOps); - mlir::Block *simdBlock = - genEntryBlock(rewriter, simdArgs, simdOp.getRegion()); + auto wrapperOp = rewriter.create(loopOp.getLoc(), clauseOps); + mlir::Block *opBlock = genEntryBlock(rewriter, args, wrapperOp.getRegion()); mlir::IRMapping mapper; mlir::Block &loopBlock = *loopOp.getRegion().begin(); - for (auto [loopOpArg, simdopArg] : - llvm::zip_equal(loopBlock.getArguments(), simdBlock->getArguments())) - mapper.map(loopOpArg, simdopArg); + for (auto [loopOpArg, opArg] : + llvm::zip_equal(loopBlock.getArguments(), opBlock->getArguments())) + mapper.map(loopOpArg, opArg); rewriter.clone(*loopOp.begin(), mapper); } diff --git a/flang/test/Lower/OpenMP/loop-directive.f90 b/flang/test/Lower/OpenMP/loop-directive.f90 index 9fa0de3bfe171..845905da0fcba 100644 --- a/flang/test/Lower/OpenMP/loop-directive.f90 +++ b/flang/test/Lower/OpenMP/loop-directive.f90 @@ -92,7 +92,7 @@ subroutine test_reduction() ! CHECK-LABEL: func.func @_QPtest_bind subroutine test_bind() integer :: i, dummy = 1 - ! CHECK: omp.loop bind(thread) private(@{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) { + ! CHECK: omp.simd private(@{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) { ! CHECK: } !$omp loop bind(thread) do i=1,10 @@ -139,3 +139,43 @@ subroutine test_nested_directives end do !$omp end target teams end subroutine + +! 
CHECK-LABEL: func.func @_QPtest_standalone_bind_teams
+subroutine test_standalone_bind_teams
+  implicit none
+  integer, parameter :: N = 100000
+  integer a(N), b(N), c(N)
+  integer j,i, num, flag;
+  num = N
+
+  ! CHECK: omp.distribute
+  ! CHECK-SAME: private(@{{.*}}Ea_private_ref_100000xi32 {{[^,]*}},
+  ! CHECK-SAME: @{{.*}}Ei_private_ref_i32 {{.*}} : {{.*}}) {
+  ! CHECK: omp.loop_nest {{.*}} {
+  ! CHECK: }
+  ! CHECK: }
+  !$omp loop bind(teams) private(a)
+  do i=1,N
+    c(i) = a(i) * b(i)
+  end do
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_standalone_bind_parallel
+subroutine test_standalone_bind_parallel
+  implicit none
+  integer, parameter :: N = 100000
+  integer a(N), b(N), c(N)
+  integer j,i, num, flag;
+  num = N
+
+  ! CHECK: omp.wsloop
+  ! CHECK-SAME: private(@{{.*}}Ea_private_ref_100000xi32 {{[^,]*}},
+  ! CHECK-SAME: @{{.*}}Ei_private_ref_i32 {{.*}} : {{.*}}) {
+  ! CHECK: omp.loop_nest {{.*}} {
+  ! CHECK: }
+  ! CHECK: }
+  !$omp loop bind(parallel) private(a)
+  do i=1,N
+    c(i) = a(i) * b(i)
+  end do
+end subroutine
From e7592d83e0ac58f61cfe8dcf61bcc8e7a8bd67b3 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Mon, 27 Jan 2025 08:00:07 -0600
Subject: [PATCH 189/432] [Offload][NFC] Make sure the thread is not running
 already

---
 offload/plugins-nextgen/common/include/RPC.h | 2 +-
 offload/plugins-nextgen/common/src/RPC.cpp   | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h
index f3a8e7555020d..42fca4aa4aebc 100644
--- a/offload/plugins-nextgen/common/include/RPC.h
+++ b/offload/plugins-nextgen/common/include/RPC.h
@@ -99,7 +99,7 @@ struct RPCServerTy {
   /// Initialize the worker thread to run in the background.
   ServerThread(void *Buffers[], plugin::GenericDeviceTy *Devices[],
                size_t Length)
-      : Running(true), NumUsers(0), CV(), Mutex(), Buffers(Buffers, Length),
+      : Running(false), NumUsers(0), CV(), Mutex(), Buffers(Buffers, Length),
         Devices(Devices, Length) {}
 
   ~ServerThread() { assert(!Running && "Thread not shut down explicitly\n"); }
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 81ad9ca66808a..e6750a540b391 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -99,10 +99,15 @@ static rpc::Status runServer(plugin::GenericDeviceTy &Device, void *Buffer) {
 }
 
 void RPCServerTy::ServerThread::startThread() {
+  assert(!Running.load(std::memory_order_relaxed) &&
+         "Attempting to start thread that is already running");
+  Running.store(true, std::memory_order_release);
   Worker = std::thread([this]() { run(); });
 }
 
 void RPCServerTy::ServerThread::shutDown() {
+  assert(Running.load(std::memory_order_relaxed) &&
+         "Attempting to shut down a thread that is not running");
   {
     std::lock_guard<decltype(Mutex)> Lock(Mutex);
     Running.store(false, std::memory_order_release);
From 86705eb6242b5e2d6153708ddedffbfc95491756 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Mon, 27 Jan 2025 14:00:03 +0000
Subject: [PATCH 190/432] [X86] huge-stack-offset.ll - add gnux32 test coverage

This should match x86 for the basic implementation, but it's useful to
check it actually runs correctly.
--- llvm/test/CodeGen/X86/huge-stack-offset.ll | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/X86/huge-stack-offset.ll b/llvm/test/CodeGen/X86/huge-stack-offset.ll index 6629811a59b23..d6080cfd3f753 100644 --- a/llvm/test/CodeGen/X86/huge-stack-offset.ll +++ b/llvm/test/CodeGen/X86/huge-stack-offset.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-linux-unknown -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-64 ; RUN: llc < %s -mtriple=i386-linux-unknown -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-32 +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-32 ; Test that a large stack offset uses a single add/sub instruction to ; adjust the stack pointer. @@ -11,7 +12,7 @@ define void @foo() nounwind { ; CHECK-64-NOT: subq $2147483647, %rsp ; CHECK-64: movabsq $50000000{{..}}, [[RAX:%r..]] ; CHECK-64-NEXT: addq [[RAX]], %rsp - +; ; CHECK-32-LABEL: foo: ; CHECK-32: ud2 ; CHECK-32-NOT: subl $2147483647, %esp @@ -27,7 +28,7 @@ define i32 @foo2() nounwind { ; CHECK-64-LABEL: foo2: ; CHECK-64: movl $10, %eax ; CHECK-64-NOT: movabsq ${{.*}}, %rax - +; ; CHECK-32-LABEL: foo2: ; CHECK-32: movl $10, %eax ; CHECK-32-NOT: movl ${{.*}}, %eax @@ -42,7 +43,7 @@ define i32 @foo3(i32 inreg %x) nounwind { ; CHECK-64-LABEL: foo3: ; CHECK-64: movabsq $50000000{{..}}, %rax ; CHECK-64-NEXT: subq %rax, %rsp - +; ; CHECK-32-LABEL: foo3: ; CHECK-32: ud2 ; CHECK-32-NOT: movl ${{.*}}, %eax From 3684ec425904424fc4dc80c8661f82bc676d7197 Mon Sep 17 00:00:00 2001 From: vdonaldson <37090318+vdonaldson@users.noreply.github.com> Date: Mon, 27 Jan 2025 09:18:47 -0500 Subject: [PATCH 191/432] [flang] IEEE underflow control for Arm (#124170) Update IEEE_SUPPORT_UNDERFLOW_CONTROL, IEEE_GET_UNDERFLOW_MODE, and IEEE_SET_UNDERFLOW_MODE code for Arm. --- flang/include/flang/Tools/TargetSetup.h | 31 ++++++++++----------- flang/runtime/exceptions.cpp | 36 ++++++++++++++++++++----- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/flang/include/flang/Tools/TargetSetup.h b/flang/include/flang/Tools/TargetSetup.h index d1b0da3a42c89..5d23df6823a94 100644 --- a/flang/include/flang/Tools/TargetSetup.h +++ b/flang/include/flang/Tools/TargetSetup.h @@ -24,34 +24,35 @@ namespace Fortran::tools { const std::string &compilerVersion, const std::string &compilerOptions) { const llvm::Triple &targetTriple{targetMachine.getTargetTriple()}; - // FIXME: Handle real(3) ? 
- if (targetTriple.getArch() != llvm::Triple::ArchType::x86_64) { - targetCharacteristics.DisableType( - Fortran::common::TypeCategory::Real, /*kind=*/10); - } + + targetCharacteristics.set_ieeeFeature(evaluate::IeeeFeature::Halting, true); + if (targetTriple.getArch() == llvm::Triple::ArchType::x86_64) { targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/3); targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/4); targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/8); } + if (targetTriple.isARM() || targetTriple.isAArch64()) { targetCharacteristics.set_haltingSupportIsUnknownAtCompileTime(); targetCharacteristics.set_ieeeFeature( evaluate::IeeeFeature::Halting, false); - } else { - targetCharacteristics.set_ieeeFeature(evaluate::IeeeFeature::Halting); + targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/3); + targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/4); + targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/8); + } + + if (targetTriple.getArch() != llvm::Triple::ArchType::x86_64) { + targetCharacteristics.DisableType( + Fortran::common::TypeCategory::Real, /*kind=*/10); } - // Figure out if we can support F128: see - // flang/runtime/Float128Math/math-entries.h - // TODO: this should be taken from TargetInfo::getLongDoubleFormat to support - // cross-compilation + // Check for kind=16 support. See flang/runtime/Float128Math/math-entries.h. + // TODO: Take this from TargetInfo::getLongDoubleFormat for cross compilation. #ifdef FLANG_RUNTIME_F128_MATH_LIB - // we can use libquadmath wrappers - constexpr bool f128Support = true; + constexpr bool f128Support = true; // use libquadmath wrappers #elif HAS_LDBL128 - // we can use libm wrappers - constexpr bool f128Support = true; + constexpr bool f128Support = true; // use libm wrappers #else constexpr bool f128Support = false; #endif diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp index f541b8e844ade..7fca0c431f8cd 100644 --- a/flang/runtime/exceptions.cpp +++ b/flang/runtime/exceptions.cpp @@ -11,7 +11,9 @@ #include "flang/Runtime/exceptions.h" #include "terminator.h" #include -#if __x86_64__ +#if __aarch64__ +#include +#elif __x86_64__ #include #endif @@ -90,20 +92,40 @@ bool RTNAME(SupportHalting)([[maybe_unused]] uint32_t except) { #endif } +// A hardware FZ (flush to zero) bit is the negation of the +// ieee_[get|set]_underflow_mode GRADUAL argument. +#if defined(_MM_FLUSH_ZERO_MASK) +// The MXCSR FZ bit affects computations of real kinds 3, 4, and 8. +#elif defined(_FPU_GETCW) +// The FPCR FZ bit affects computations of real kinds 3, 4, and 8. +// bit 24: FZ -- single, double precision flush to zero bit +// bit 19: FZ16 -- half precision flush to zero bit [not currently relevant] +#define _FPU_FPCR_FZ_MASK_ 0x01080000 +#endif + bool RTNAME(GetUnderflowMode)(void) { -#if _MM_FLUSH_ZERO_MASK - // The MXCSR Flush to Zero flag is the negation of the ieee_get_underflow_mode - // GRADUAL argument. It affects real computations of kinds 3, 4, and 8. +#if defined(_MM_FLUSH_ZERO_MASK) return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_OFF; +#elif defined(_FPU_GETCW) + uint32_t fpcr; + _FPU_GETCW(fpcr); + return (fpcr & _FPU_FPCR_FZ_MASK_) != _FPU_FPCR_FZ_MASK_; #else return false; #endif } void RTNAME(SetUnderflowMode)(bool flag) { -#if _MM_FLUSH_ZERO_MASK - // The MXCSR Flush to Zero flag is the negation of the ieee_set_underflow_mode - // GRADUAL argument. It affects real computations of kinds 3, 4, and 8. 
+#if defined(_MM_FLUSH_ZERO_MASK) _MM_SET_FLUSH_ZERO_MODE(flag ? _MM_FLUSH_ZERO_OFF : _MM_FLUSH_ZERO_ON); +#elif defined(_FPU_GETCW) + uint32_t fpcr; + _FPU_GETCW(fpcr); + if (flag) { + fpcr &= ~_FPU_FPCR_FZ_MASK_; + } else { + fpcr |= _FPU_FPCR_FZ_MASK_; + } + _FPU_SETCW(fpcr); #endif } From 3a4376b8f90686f754ee51b296a064ab03c12895 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 27 Jan 2025 14:21:14 +0000 Subject: [PATCH 192/432] LAA: handle 0 return from getPtrStride correctly (#124539) getPtrStride returns 0 when the PtrScev is loop-invariant, and this is not an erroneous value: it returns std::nullopt to communicate that it was not able to find a valid pointer stride. In analyzeLoop, we call getPtrStride with a value_or(0) conflating the zero return value with std::nullopt. Fix this, handling loop-invariant loads correctly. --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 4 +- .../LoopAccessAnalysis/pointer-phis.ll | 8 -- .../LoopDistribute/pointer-phi-in-loop.ll | 132 +++++++++--------- .../Inputs/loop-distribute.ll | 6 +- .../Inputs/loop-distribute.ll.expected | 46 +++--- 5 files changed, 101 insertions(+), 95 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 11e0a221fc887..697b40403902c 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1438,7 +1438,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, bool Assume, bool ShouldCheckWrap) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); if (PSE.getSE()->isLoopInvariant(PtrScev, Lp)) - return {0}; + return 0; Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); @@ -2593,7 +2593,7 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, bool IsReadOnlyPtr = false; Type *AccessTy = getLoadStoreType(LD); if (Seen.insert({Ptr, AccessTy}).second || - !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides).value_or(0)) { + !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) { ++NumReads; IsReadOnlyPtr = true; } diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll b/llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll index a214451bfd3fd..48586ee9d9ed9 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll @@ -501,14 +501,6 @@ define void @phi_load_store_memdep_check(i1 %c, ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: %lv3 = load i16, ptr %c.sink, align 2 -> ; CHECK-NEXT: store i16 %add, ptr %c.sink, align 1 ; CHECK-EMPTY: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: %lv = load i16, ptr %A, align 1 -> -; CHECK-NEXT: store i16 %lv, ptr %A, align 1 -; CHECK-EMPTY: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: store i16 %lv, ptr %A, align 1 -> -; CHECK-NEXT: %lv2 = load i16, ptr %A, align 1 -; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Check 0: ; CHECK-NEXT: Comparing group ([[GRP10:0x[0-9a-f]+]]): diff --git a/llvm/test/Transforms/LoopDistribute/pointer-phi-in-loop.ll b/llvm/test/Transforms/LoopDistribute/pointer-phi-in-loop.ll index 2ab9140baf866..b95551eb94f4c 100644 --- a/llvm/test/Transforms/LoopDistribute/pointer-phi-in-loop.ll +++ b/llvm/test/Transforms/LoopDistribute/pointer-phi-in-loop.ll @@ -3,26 +3,73 @@ ; Testcases inspired by PR50296, PR50288. 
-define void @phi_load_store_distribute(i1 %c, ptr %A, ptr %B, ptr %C) { +define void @phi_load_store_distribute(i1 %cond, ptr %A, ptr %B, ptr %C) { ; CHECK-LABEL: @phi_load_store_distribute( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: [[LV:%.*]] = load i16, ptr [[A:%.*]], align 1 +; CHECK: for.body.lver.check: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 2 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 2 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[ENTRY:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]] +; CHECK: for.body.ph.lver.orig: +; CHECK-NEXT: br label [[FOR_BODY_LVER_ORIG:%.*]] +; CHECK: for.body.lver.orig: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[LV:%.*]] = load i16, ptr [[A]], align 1 ; CHECK-NEXT: store i16 [[LV]], ptr [[A]], align 1 -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_END]] -; CHECK: if.then: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK: if.then.lver.orig: ; CHECK-NEXT: [[LV2:%.*]] = load i16, ptr [[A]], align 1 ; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end.lver.orig: +; CHECK-NEXT: [[C_SINK_LVER_ORIG:%.*]] = phi ptr [ [[B]], [[IF_THEN]] ], [ [[C]], [[FOR_BODY_LVER_ORIG]] ] +; CHECK-NEXT: [[LV3_LVER_ORIG:%.*]] = load i16, ptr [[C_SINK_LVER_ORIG]], align 2 +; CHECK-NEXT: [[ADD_LVER_ORIG:%.*]] = add i16 [[LV3_LVER_ORIG]], 10 +; CHECK-NEXT: store i16 [[ADD_LVER_ORIG]], ptr [[C_SINK_LVER_ORIG]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i16 [[IV]], 1 +; CHECK-NEXT: [[TOBOOL_NOT_LVER_ORIG:%.*]] = icmp eq i16 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TOBOOL_NOT_LVER_ORIG]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY_LVER_ORIG]] +; CHECK: for.body.ph.ldist1: +; CHECK-NEXT: br label [[FOR_BODY_LDIST1:%.*]] +; CHECK: for.body.ldist1: +; CHECK-NEXT: [[IV_LDIST1:%.*]] = phi i16 [ 0, [[FOR_BODY_PH_LDIST1]] ], [ [[IV_NEXT_LDIST1:%.*]], [[IF_END_LDIST1:%.*]] ] +; CHECK-NEXT: [[LV_LDIST1:%.*]] = load i16, ptr [[A]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: store i16 [[LV_LDIST1]], ptr [[A]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN_LDIST1:%.*]], label [[IF_END_LDIST1]] +; CHECK: if.then.ldist1: +; CHECK-NEXT: [[LV2_LDIST1:%.*]] = load i16, ptr [[A]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: br label [[IF_END_LDIST1]] +; CHECK: if.end.ldist1: +; CHECK-NEXT: [[IV_NEXT_LDIST1]] = add nuw nsw i16 [[IV_LDIST1]], 1 +; CHECK-NEXT: [[TOBOOL_NOT_LDIST1:%.*]] = icmp eq i16 [[IV_NEXT_LDIST1]], 1000 +; CHECK-NEXT: br i1 [[TOBOOL_NOT_LDIST1]], label [[FOR_BODY_PH:%.*]], label [[FOR_BODY_LDIST1]] +; CHECK: for.body.ph: +; CHECK-NEXT: br label [[FOR_BODY1:%.*]] +; CHECK: for.body: +; 
CHECK-NEXT: [[IV1:%.*]] = phi i16 [ 0, [[FOR_BODY_PH]] ], [ [[IV_NEXT1:%.*]], [[IF_END1:%.*]] ] +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN1:%.*]], label [[IF_END1]] +; CHECK: if.then: +; CHECK-NEXT: br label [[IF_END1]] ; CHECK: if.end: -; CHECK-NEXT: [[C_SINK:%.*]] = phi ptr [ [[B:%.*]], [[IF_THEN]] ], [ [[C:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[C_SINK:%.*]] = phi ptr [ [[B]], [[IF_THEN1]] ], [ [[C]], [[FOR_BODY1]] ] ; CHECK-NEXT: [[LV3:%.*]] = load i16, ptr [[C_SINK]], align 2 ; CHECK-NEXT: [[ADD:%.*]] = add i16 [[LV3]], 10 ; CHECK-NEXT: store i16 [[ADD]], ptr [[C_SINK]], align 1 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i16 [[IV]], 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i16 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i16 [[IV1]], 1 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i16 [[IV_NEXT1]], 1000 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END_LOOPEXIT_LOOPEXIT6:%.*]], label [[FOR_BODY1]] +; CHECK: for.end.loopexit.loopexit: +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: for.end.loopexit.loopexit6: +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: ret void ; @@ -33,7 +80,7 @@ for.body: ; preds = %if.end, %entry %iv = phi i16 [ 0, %entry ], [ %iv.next, %if.end ] %lv = load i16, ptr %A, align 1 store i16 %lv, ptr %A, align 1 - br i1 %c, label %if.then, label %if.end + br i1 %cond, label %if.then, label %if.end if.then: ; preds = %for.body %lv2 = load i16, ptr %A, align 1 @@ -55,66 +102,21 @@ for.end.loopexit: ; preds = %if.end define void @phi_load_distribute(i1 %cond, ptr %A, ptr %B, ptr %C) { ; CHECK-LABEL: @phi_load_distribute( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY_LVER_CHECK:%.*]] -; CHECK: for.body.lver.check: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 2 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 2 -; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 2 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] -; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]] -; CHECK: for.body.ph.lver.orig: ; CHECK-NEXT: br label [[FOR_BODY_LVER_ORIG:%.*]] -; CHECK: for.body.lver.orig: -; CHECK-NEXT: [[IV_LVER_ORIG:%.*]] = phi i16 [ 0, [[FOR_BODY_PH_LVER_ORIG]] ], [ [[IV_NEXT_LVER_ORIG:%.*]], [[IF_END_LVER_ORIG:%.*]] ] -; CHECK-NEXT: [[LV_LVER_ORIG:%.*]] = load i16, ptr [[A]], align 1 +; CHECK: for.body: +; CHECK-NEXT: [[IV_LVER_ORIG:%.*]] = phi i16 [ 0, [[FOR_BODY_PH_LVER_ORIG:%.*]] ], [ [[IV_NEXT_LVER_ORIG:%.*]], [[IF_END_LVER_ORIG:%.*]] ] +; CHECK-NEXT: [[LV_LVER_ORIG:%.*]] = load i16, ptr [[A:%.*]], align 1 ; CHECK-NEXT: store i16 [[LV_LVER_ORIG]], ptr [[A]], align 1 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN_LVER_ORIG:%.*]], label [[IF_END_LVER_ORIG]] -; CHECK: if.then.lver.orig: -; CHECK-NEXT: [[LV2_LVER_ORIG:%.*]] = load i16, ptr [[A]], align 1 -; CHECK-NEXT: br label [[IF_END_LVER_ORIG]] -; CHECK: if.end.lver.orig: -; CHECK-NEXT: 
[[C_SINK_LVER_ORIG:%.*]] = phi ptr [ [[B]], [[IF_THEN_LVER_ORIG]] ], [ [[C]], [[FOR_BODY_LVER_ORIG]] ] -; CHECK-NEXT: [[LV3_LVER_ORIG:%.*]] = load i16, ptr [[C_SINK_LVER_ORIG]], align 2 -; CHECK-NEXT: [[IV_NEXT_LVER_ORIG]] = add nuw nsw i16 [[IV_LVER_ORIG]], 1 -; CHECK-NEXT: [[TOBOOL_NOT_LVER_ORIG:%.*]] = icmp eq i16 [[IV_NEXT_LVER_ORIG]], 1000 -; CHECK-NEXT: br i1 [[TOBOOL_NOT_LVER_ORIG]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY_LVER_ORIG]] -; CHECK: for.body.ph.ldist1: -; CHECK-NEXT: br label [[FOR_BODY_LDIST1:%.*]] -; CHECK: for.body.ldist1: -; CHECK-NEXT: [[IV_LDIST1:%.*]] = phi i16 [ 0, [[FOR_BODY_PH_LDIST1]] ], [ [[IV_NEXT_LDIST1:%.*]], [[IF_END_LDIST1:%.*]] ] -; CHECK-NEXT: [[LV_LDIST1:%.*]] = load i16, ptr [[A]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] -; CHECK-NEXT: store i16 [[LV_LDIST1]], ptr [[A]], align 1, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN_LDIST1:%.*]], label [[IF_END_LDIST1]] -; CHECK: if.then.ldist1: -; CHECK-NEXT: [[LV2_LDIST1:%.*]] = load i16, ptr [[A]], align 1, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: br label [[IF_END_LDIST1]] -; CHECK: if.end.ldist1: -; CHECK-NEXT: [[IV_NEXT_LDIST1]] = add nuw nsw i16 [[IV_LDIST1]], 1 -; CHECK-NEXT: [[TOBOOL_NOT_LDIST1:%.*]] = icmp eq i16 [[IV_NEXT_LDIST1]], 1000 -; CHECK-NEXT: br i1 [[TOBOOL_NOT_LDIST1]], label [[FOR_BODY_PH:%.*]], label [[FOR_BODY_LDIST1]] -; CHECK: for.body.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[FOR_BODY_PH]] ], [ [[IV_NEXT:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END]] ; CHECK: if.then: -; CHECK-NEXT: br label [[IF_END]] +; CHECK-NEXT: [[LV2:%.*]] = load i16, ptr [[A]], align 1 +; CHECK-NEXT: br label [[IF_END_LVER_ORIG]] ; CHECK: if.end: -; CHECK-NEXT: [[C_SINK:%.*]] = phi ptr [ [[B]], [[IF_THEN]] ], [ [[C]], [[FOR_BODY]] ] +; CHECK-NEXT: [[C_SINK:%.*]] = phi ptr [ [[B:%.*]], [[IF_THEN_LVER_ORIG]] ], [ [[C:%.*]], [[FOR_BODY_LVER_ORIG]] ] ; CHECK-NEXT: [[LV3:%.*]] = load i16, ptr [[C_SINK]], align 2 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i16 [[IV]], 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i16 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END_LOOPEXIT_LOOPEXIT6:%.*]], label [[FOR_BODY]] -; CHECK: for.end.loopexit.loopexit: -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: for.end.loopexit.loopexit6: -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK-NEXT: [[IV_NEXT_LVER_ORIG]] = add nuw nsw i16 [[IV_LVER_ORIG]], 1 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i16 [[IV_NEXT_LVER_ORIG]], 1000 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY_LVER_ORIG]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll index 48f80533c6379..548aa0ab2673b 100644 --- a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll +++ b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll @@ -1,7 +1,7 @@ ; RUN: opt -passes=loop-distribute -enable-loop-distribute \ ; RUN: -debug-only=loop-distribute -disable-output 2>&1 %s | FileCheck %s -define void @ldist(i1 %c, ptr %A, ptr %B, ptr %C) { +define void @ldist(i1 %cond, ptr %A, ptr %B, ptr %C) { entry: br label %for.body @@ -9,7 +9,7 @@ 
for.body: ; preds = %if.end, %entry %iv = phi i16 [ 0, %entry ], [ %iv.next, %if.end ] %lv = load i16, ptr %A, align 1 store i16 %lv, ptr %A, align 1 - br i1 %c, label %if.then, label %if.end + br i1 %cond, label %if.then, label %if.end if.then: ; preds = %for.body %lv2 = load i16, ptr %A, align 1 @@ -18,6 +18,8 @@ if.then: ; preds = %for.body if.end: ; preds = %if.then, %for.body %c.sink = phi ptr [ %B, %if.then ], [ %C, %for.body ] %lv3 = load i16, ptr %c.sink + %add = add i16 %lv3, 10 + store i16 %add, ptr %c.sink, align 1 %iv.next = add nuw nsw i16 %iv, 1 %tobool.not = icmp eq i16 %iv.next, 1000 br i1 %tobool.not, label %for.end.loopexit, label %for.body diff --git a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll.expected b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll.expected index baef851b84ee5..eba378c175091 100644 --- a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll.expected @@ -2,51 +2,55 @@ ; RUN: opt -passes=loop-distribute -enable-loop-distribute \ ; RUN: -debug-only=loop-distribute -disable-output 2>&1 %s | FileCheck %s -define void @ldist(i1 %c, ptr %A, ptr %B, ptr %C) { +define void @ldist(i1 %cond, ptr %A, ptr %B, ptr %C) { ; CHECK-LABEL: 'ldist' ; CHECK-NEXT: LDist: Found a candidate loop: for.body ; CHECK-NEXT: LDist: Backward dependences: ; CHECK-NEXT: Unknown: -; CHECK-NEXT: %lv = load i16, ptr %A, align 1 -> -; CHECK-NEXT: store i16 %lv, ptr %A, align 1 +; CHECK-NEXT: %lv3 = load i16, ptr %c.sink, align 2 -> +; CHECK-NEXT: store i16 %add, ptr %c.sink, align 1 ; CHECK-NEXT: Unknown: -; CHECK-NEXT: store i16 %lv, ptr %A, align 1 -> -; CHECK-NEXT: %lv2 = load i16, ptr %A, align 1 +; CHECK-NEXT: %lv3 = load i16, ptr %c.sink, align 2 -> +; CHECK-NEXT: store i16 %add, ptr %c.sink, align 1 ; CHECK-NEXT: LDist: Seeded partitions: -; CHECK-NEXT: LDist: Partition 0: (cycle) +; CHECK-NEXT: LDist: Partition 0: ; CHECK-NEXT: for.body: %lv = load i16, ptr %A, align 1 -; CHECK-NEXT: for.body: store i16 %lv, ptr %A, align 1 -; CHECK-NEXT: if.then: %lv2 = load i16, ptr %A, align 1 ; CHECK-NEXT: LDist: Partition 1: -; CHECK-NEXT: if.end: %lv3 = load i16, ptr %c.sink, align 2 +; CHECK-NEXT: for.body: store i16 %lv, ptr %A, align 1 ; CHECK-NEXT: LDist: Partition 2: +; CHECK-NEXT: if.then: %lv2 = load i16, ptr %A, align 1 +; CHECK-NEXT: LDist: Partition 3: (cycle) ; CHECK-NEXT: if.end: %lv3 = load i16, ptr %c.sink, align 2 +; CHECK-NEXT: if.end: store i16 %add, ptr %c.sink, align 1 ; CHECK-NEXT: LDist: Merged partitions: -; CHECK-NEXT: LDist: Partition 0: (cycle) +; CHECK-NEXT: LDist: Partition 0: ; CHECK-NEXT: for.body: %lv = load i16, ptr %A, align 1 ; CHECK-NEXT: for.body: store i16 %lv, ptr %A, align 1 ; CHECK-NEXT: if.then: %lv2 = load i16, ptr %A, align 1 -; CHECK-NEXT: LDist: Partition 1: +; CHECK-NEXT: LDist: Partition 1: (cycle) ; CHECK-NEXT: if.end: %lv3 = load i16, ptr %c.sink, align 2 +; CHECK-NEXT: if.end: store i16 %add, ptr %c.sink, align 1 ; CHECK-NEXT: LDist: Populated partitions: -; CHECK-NEXT: LDist: Partition 0: (cycle) +; CHECK-NEXT: LDist: Partition 0: ; CHECK-NEXT: for.body: %lv = load i16, ptr %A, align 1 ; CHECK-NEXT: for.body: store i16 %lv, ptr %A, align 1 ; CHECK-NEXT: if.then: %lv2 = load i16, ptr %A, align 1 -; CHECK-NEXT: for.body: br i1 %c, label %if.then, label %if.end +; CHECK-NEXT: for.body: br i1 %cond, label %if.then, label %if.end ; 
CHECK-NEXT: if.then: br label %if.end ; CHECK-NEXT: if.end: br i1 %tobool.not, label %for.end.loopexit, label %for.body ; CHECK-NEXT: if.end: %tobool.not = icmp eq i16 %iv.next, 1000 ; CHECK-NEXT: if.end: %iv.next = add nuw nsw i16 %iv, 1 ; CHECK-NEXT: for.body: %iv = phi i16 [ 0, %entry ], [ %iv.next, %if.end ] -; CHECK-NEXT: LDist: Partition 1: +; CHECK-NEXT: LDist: Partition 1: (cycle) ; CHECK-NEXT: if.end: %lv3 = load i16, ptr %c.sink, align 2 -; CHECK-NEXT: for.body: br i1 %c, label %if.then, label %if.end +; CHECK-NEXT: if.end: store i16 %add, ptr %c.sink, align 1 +; CHECK-NEXT: for.body: br i1 %cond, label %if.then, label %if.end ; CHECK-NEXT: if.then: br label %if.end ; CHECK-NEXT: if.end: br i1 %tobool.not, label %for.end.loopexit, label %for.body ; CHECK-NEXT: if.end: %tobool.not = icmp eq i16 %iv.next, 1000 ; CHECK-NEXT: if.end: %iv.next = add nuw nsw i16 %iv, 1 ; CHECK-NEXT: for.body: %iv = phi i16 [ 0, %entry ], [ %iv.next, %if.end ] +; CHECK-NEXT: if.end: %add = add i16 %lv3, 10 ; CHECK-NEXT: if.end: %c.sink = phi ptr [ %B, %if.then ], [ %C, %for.body ] ; CHECK-NEXT: LDist: Distributing loop: for.body ; CHECK-NEXT: LDist: Pointers: @@ -56,19 +60,21 @@ define void @ldist(i1 %c, ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: ptr %A ; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]): ; CHECK-NEXT: ptr %C +; CHECK-NEXT: ptr %C ; CHECK-NEXT: Check 1: ; CHECK-NEXT: Comparing group ([[GRP1]]): ; CHECK-NEXT: ptr %A ; CHECK-NEXT: ptr %A ; CHECK-NEXT: Against group ([[GRP3:0x[0-9a-f]+]]): ; CHECK-NEXT: ptr %B +; CHECK-NEXT: ptr %B ; CHECK-NEXT: LDist: After removing unused Instrs: ; CHECK-NEXT: LDist: Partition 0: ; CHECK-NEXT: for.body.ldist1: ; preds = %if.end.ldist1, %for.body.ph.ldist1 ; CHECK-NEXT: %iv.ldist1 = phi i16 [ 0, %for.body.ph.ldist1 ], [ %iv.next.ldist1, %if.end.ldist1 ] ; CHECK-NEXT: %lv.ldist1 = load i16, ptr %A, align 1, !alias.scope !0, !noalias !3 ; CHECK-NEXT: store i16 %lv.ldist1, ptr %A, align 1, !alias.scope !0, !noalias !3 -; CHECK-NEXT: br i1 %c, label %if.then.ldist1, label %if.end.ldist1 +; CHECK-NEXT: br i1 %cond, label %if.then.ldist1, label %if.end.ldist1 ; CHECK-EMPTY: ; CHECK-NEXT: if.then.ldist1: ; preds = %for.body.ldist1 ; CHECK-NEXT: %lv2.ldist1 = load i16, ptr %A, align 1, !alias.scope !0, !noalias !3 @@ -81,7 +87,7 @@ define void @ldist(i1 %c, ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: LDist: Partition 1: ; CHECK-NEXT: for.body: ; preds = %if.end, %for.body.ph ; CHECK-NEXT: %iv = phi i16 [ 0, %for.body.ph ], [ %iv.next, %if.end ] -; CHECK-NEXT: br i1 %c, label %if.then, label %if.end +; CHECK-NEXT: br i1 %cond, label %if.then, label %if.end ; CHECK-EMPTY: ; CHECK-NEXT: if.then: ; preds = %for.body ; CHECK-NEXT: br label %if.end @@ -89,6 +95,8 @@ define void @ldist(i1 %c, ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: if.end: ; preds = %if.then, %for.body ; CHECK-NEXT: %c.sink = phi ptr [ %B, %if.then ], [ %C, %for.body ] ; CHECK-NEXT: %lv3 = load i16, ptr %c.sink, align 2 +; CHECK-NEXT: %add = add i16 %lv3, 10 +; CHECK-NEXT: store i16 %add, ptr %c.sink, align 1 ; CHECK-NEXT: %iv.next = add nuw nsw i16 %iv, 1 ; CHECK-NEXT: %tobool.not = icmp eq i16 %iv.next, 1000 ; CHECK-NEXT: br i1 %tobool.not, label %for.end.loopexit.loopexit6, label %for.body @@ -100,7 +108,7 @@ for.body: ; preds = %if.end, %entry %iv = phi i16 [ 0, %entry ], [ %iv.next, %if.end ] %lv = load i16, ptr %A, align 1 store i16 %lv, ptr %A, align 1 - br i1 %c, label %if.then, label %if.end + br i1 %cond, label %if.then, label %if.end if.then: ; preds = %for.body %lv2 = load i16, ptr %A, align 1 @@ 
-109,6 +117,8 @@ if.then: ; preds = %for.body if.end: ; preds = %if.then, %for.body %c.sink = phi ptr [ %B, %if.then ], [ %C, %for.body ] %lv3 = load i16, ptr %c.sink + %add = add i16 %lv3, 10 + store i16 %add, ptr %c.sink, align 1 %iv.next = add nuw nsw i16 %iv, 1 %tobool.not = icmp eq i16 %iv.next, 1000 br i1 %tobool.not, label %for.end.loopexit, label %for.body From f07505849c8e683bf8f444e205d3dd3284759b7d Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 27 Jan 2025 08:29:20 -0600 Subject: [PATCH 193/432] [Offload] Fix server thread from being shut down if unused --- offload/plugins-nextgen/common/src/PluginInterface.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index c9acabea6977d..0d5169191c5af 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -1633,11 +1633,12 @@ Error GenericPluginTy::deinit() { if (GlobalHandler) delete GlobalHandler; - if (RPCServer) { + if (RPCServer->Thread->Running.load(std::memory_order_relaxed)) if (Error Err = RPCServer->shutDown()) return Err; + + if (RPCServer) delete RPCServer; - } if (RecordReplay) delete RecordReplay; From 54928a10c8dba7c07c6224c1ead5c02a335890e6 Mon Sep 17 00:00:00 2001 From: Dipesh Sharma <76941383+dipeshs809@users.noreply.github.com> Date: Mon, 27 Jan 2025 20:00:53 +0530 Subject: [PATCH 194/432] [clang] __STDC_NO_THREADS__ is no longer necessary for VS 2022 1939 and above (#117149) Since `__STDC_NO_THREADS__` is a reserved identifier, it is now defined only when: - `MSVC version < 17.9` (1939) - C version < C11 (201112L) - `<threads.h>` is unavailable, i.e. `!__has_include(<threads.h>)` if `__has_include` is defined. Closes #115529 --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Basic/LangOptions.h | 1 + clang/lib/Basic/Targets/OSTargets.cpp | 6 ++++-- .../deprecate-threads-macro-definition-msvc1939.c | 15 +++++++++++++++ clang/test/Preprocessor/init-aarch64.c | 2 +- 5 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 clang/test/Preprocessor/deprecate-threads-macro-definition-msvc1939.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c60565a568234..55a4a2e32383a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -387,6 +387,7 @@ C Language Changes ------------------ - Extend clang's ``<limits.h>`` to define ``LONG_LONG_*`` macros for Android's bionic. +- Macro ``__STDC_NO_THREADS__`` is no longer necessary for MSVC 2022 1939 and later.
C2y Feature Support ^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 949c8f5d448bc..114a5d34a008b 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -144,6 +144,7 @@ class LangOptionsBase { MSVC2019_5 = 1925, MSVC2019_8 = 1928, MSVC2022_3 = 1933, + MSVC2022_9 = 1939, }; enum SYCLMajorVersion { diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp index bf10f9a725567..8af6623e5cb15 100644 --- a/clang/lib/Basic/Targets/OSTargets.cpp +++ b/clang/lib/Basic/Targets/OSTargets.cpp @@ -259,8 +259,10 @@ static void addVisualCDefines(const LangOptions &Opts, MacroBuilder &Builder) { Builder.defineMacro("_KERNEL_MODE"); Builder.defineMacro("_INTEGRAL_MAX_BITS", "64"); - Builder.defineMacro("__STDC_NO_THREADS__"); - + // Define __STDC_NO_THREADS__ based on MSVC version, threads.h availability, + // and language standard. + if (!(Opts.isCompatibleWithMSVC(LangOptions::MSVC2022_9) && Opts.C11)) + Builder.defineMacro("__STDC_NO_THREADS__"); // Starting with VS 2022 17.1, MSVC predefines the below macro to inform // users of the execution character set defined at compile time. // The value given is the Windows Code Page Identifier: diff --git a/clang/test/Preprocessor/deprecate-threads-macro-definition-msvc1939.c b/clang/test/Preprocessor/deprecate-threads-macro-definition-msvc1939.c new file mode 100644 index 0000000000000..e197d8d403a3f --- /dev/null +++ b/clang/test/Preprocessor/deprecate-threads-macro-definition-msvc1939.c @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c89 -fms-compatibility-version=19.33 -ffreestanding < /dev/null | FileCheck -check-prefix=C89_MSVC33 %s +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c99 -fms-compatibility-version=19.33 -ffreestanding < /dev/null | FileCheck -check-prefix=C99_MSVC33 %s +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c11 -fms-compatibility-version=19.33 -ffreestanding < /dev/null | FileCheck -check-prefix=C11_MSVC33 %s +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c89 -fms-compatibility-version=19.39 -ffreestanding < /dev/null | FileCheck -check-prefix=C89_MSVC39 %s +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c99 -fms-compatibility-version=19.39 -ffreestanding < /dev/null | FileCheck -check-prefix=C99_MSVC39 %s +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c11 -fms-compatibility-version=19.39 -ffreestanding < /dev/null | FileCheck -check-prefix=C11_MSVC39 %s +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c11 -fms-compatibility-version=19.40 -ffreestanding < /dev/null | FileCheck -check-prefix=C11_MSVC40 %s + +// C89_MSVC33: #define __STDC_NO_THREADS__ 1 +// C99_MSVC33: #define __STDC_NO_THREADS__ 1 +// C11_MSVC33: #define __STDC_NO_THREADS__ 1 +// C89_MSVC39: #define __STDC_NO_THREADS__ 1 +// C99_MSVC39: #define __STDC_NO_THREADS__ 1 +// C11_MSVC39-NOT: #define __STDC_NO_THREADS__ +// C11_MSVC40-NOT: #define __STDC_NO_THREADS__ diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index b5e77ba10c347..5f47de4b49b69 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -772,7 +772,7 @@ // AARCH64-MSVC: #define __WINT_WIDTH__ 16 // AARCH64-MSVC: #define __aarch64__ 1 -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm64ec-windows-msvc < /dev/null | FileCheck -match-full-lines -check-prefix 
ARM64EC-MSVC %s +// RUN: %clang_cc1 -E -dM -fms-compatibility-version=19.33 -ffreestanding -triple=arm64ec-windows-msvc < /dev/null | FileCheck -match-full-lines -check-prefix ARM64EC-MSVC %s // ARM64EC-MSVC: #define _INTEGRAL_MAX_BITS 64 // ARM64EC-MSVC: #define _M_AMD64 100 From f95a8bde3425ada0ef004186eb8ccda6e723241c Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 27 Jan 2025 14:31:41 +0000 Subject: [PATCH 195/432] [AArch64] Refactor implementation of FP8 types (NFC) (#123604) - The FP8 scalar type (`__mfp8`) was described as a vector type - The FP8 vector types were described/assumed to have integer element type (the element type ought to be `__mfp8`) - Add support for `m` type specifier (denoting `__mfp8`) in `DecodeTypeFromStr` and create builtin function prototypes using that specifier, instead of `int8_t` --- .../clang/Basic/AArch64SVEACLETypes.def | 35 +++++++++---------- clang/lib/AST/ASTContext.cpp | 30 +++++++++------- clang/lib/AST/ItaniumMangle.cpp | 2 +- clang/lib/AST/Type.cpp | 4 +-- clang/lib/CodeGen/CodeGenTypes.cpp | 22 +++++++----- clang/lib/CodeGen/Targets/AArch64.cpp | 7 ++-- clang/utils/TableGen/SveEmitter.cpp | 4 +-- 7 files changed, 58 insertions(+), 46 deletions(-) diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def index 2dd2754e778d6..a408bb0c54057 100644 --- a/clang/include/clang/Basic/AArch64SVEACLETypes.def +++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def @@ -57,6 +57,11 @@ // - IsBF true for vector of brain float elements. //===----------------------------------------------------------------------===// +#ifndef SVE_SCALAR_TYPE +#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \ + SVE_TYPE(Name, Id, SingletonId) +#endif + #ifndef SVE_VECTOR_TYPE #define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ SVE_TYPE(Name, Id, SingletonId) @@ -72,6 +77,11 @@ SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, true) #endif +#ifndef SVE_VECTOR_TYPE_MFLOAT +#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \ + SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, false) +#endif + #ifndef SVE_VECTOR_TYPE_FLOAT #define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \ SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, true, false) @@ -97,16 +107,6 @@ SVE_TYPE(Name, Id, SingletonId) #endif -#ifndef AARCH64_VECTOR_TYPE -#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ - SVE_TYPE(Name, Id, SingletonId) -#endif - -#ifndef AARCH64_VECTOR_TYPE_MFLOAT -#define AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \ - AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) -#endif - //===- Vector point types -----------------------------------------------===// SVE_VECTOR_TYPE_INT("__SVInt8_t", "__SVInt8_t", SveInt8, SveInt8Ty, 16, 8, 1, true) @@ -125,8 +125,7 @@ SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, 1) -// This is a 8 bits opaque type. 
-SVE_VECTOR_TYPE_INT("__SVMfloat8_t", "__SVMfloat8_t", SveMFloat8, SveMFloat8Ty, 16, 8, 1, false) +SVE_VECTOR_TYPE_MFLOAT("__SVMfloat8_t", "__SVMfloat8_t", SveMFloat8, SveMFloat8Ty, 16, 8, 1) // // x2 @@ -148,7 +147,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, Sv SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2) -SVE_VECTOR_TYPE_INT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2, false) +SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2) // // x3 @@ -170,7 +169,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, Sv SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3) -SVE_VECTOR_TYPE_INT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3, false) +SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3) // // x4 @@ -192,7 +191,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, Sv SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4) -SVE_VECTOR_TYPE_INT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4, false) +SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4) SVE_PREDICATE_TYPE_ALL("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16, 1) SVE_PREDICATE_TYPE_ALL("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 16, 2) @@ -200,15 +199,15 @@ SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4T SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy) -AARCH64_VECTOR_TYPE_MFLOAT("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 1, 8, 1) +SVE_SCALAR_TYPE("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 8) #undef SVE_VECTOR_TYPE +#undef SVE_VECTOR_TYPE_MFLOAT #undef SVE_VECTOR_TYPE_BFLOAT #undef SVE_VECTOR_TYPE_FLOAT #undef SVE_VECTOR_TYPE_INT #undef SVE_PREDICATE_TYPE #undef SVE_PREDICATE_TYPE_ALL #undef SVE_OPAQUE_TYPE -#undef AARCH64_VECTOR_TYPE_MFLOAT -#undef AARCH64_VECTOR_TYPE +#undef SVE_SCALAR_TYPE #undef SVE_TYPE diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index a4ba9fd055346..cd1bcb3b9a063 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2269,11 +2269,10 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { Width = 0; \ Align = 16; \ break; -#define AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ - ElBits, NF) \ +#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \ case BuiltinType::Id: \ - Width = NumEls * ElBits * NF; \ - Align = NumEls * ElBits; \ + Width = Bits; \ + Align = Bits; \ break; #include "clang/Basic/AArch64SVEACLETypes.def" #define PPC_VECTOR_TYPE(Name, Id, Size) \ @@ -4423,15 +4422,14 @@ ASTContext::getBuiltinVectorTypeInfo(const BuiltinType *Ty) const { ElBits, NF) \ case BuiltinType::Id: \ return {BFloat16Ty, llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + case BuiltinType::Id: \ + return {MFloat8Ty, llvm::ElementCount::getScalable(NumEls), NF}; #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ case BuiltinType::Id: \ return {BoolTy, llvm::ElementCount::getScalable(NumEls), NF}; -#define 
AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ - ElBits, NF) \ - case BuiltinType::Id: \ - return {getIntTypeForBitwidth(ElBits, false), \ - llvm::ElementCount::getFixed(NumEls), NF}; -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) +#define SVE_TYPE(Name, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" #define RVV_VECTOR_TYPE_INT(Name, Id, SingletonId, NumEls, ElBits, NF, \ @@ -4493,11 +4491,16 @@ QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts, EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ return SingletonId; \ } +#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + if (EltTy->isMFloat8Type() && EltTySize == ElBits && \ + NumElts == (NumEls * NF) && NumFields == 1) { \ + return SingletonId; \ + } #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ if (EltTy->isBooleanType() && NumElts == (NumEls * NF) && NumFields == 1) \ return SingletonId; -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) -#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) +#define SVE_TYPE(Name, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" } else if (Target->hasRISCVVTypes()) { uint64_t EltTySize = getTypeSize(EltTy); @@ -12382,6 +12385,9 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, case 'p': Type = Context.getProcessIDType(); break; + case 'm': + Type = Context.MFloat8Ty; + break; } // If there are modifiers and if we're allowed to parse them, go for it. diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 9948963d7f44b..49089c0ea3c8a 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -3433,7 +3433,7 @@ void CXXNameMangler::mangleType(const BuiltinType *T) { type_name = MangledName; \ Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ break; -#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ +#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \ case BuiltinType::Id: \ type_name = MangledName; \ Out << (type_name == Name ? 
"u" : "") << type_name.size() << type_name; \ diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index caa0ac858a1be..fde0746a17570 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2527,9 +2527,7 @@ bool Type::isSVESizelessBuiltinType() const { #define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ return true; -#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ - case BuiltinType::Id: \ - return false; +#define SVE_TYPE(Name, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" default: return false; diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 950b23f4e13b9..405242e97e75c 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -505,15 +505,18 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { case BuiltinType::Id: #define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: -#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ - case BuiltinType::Id: -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) +#define SVE_TYPE(Name, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" { ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(cast(Ty)); - auto VTy = - llvm::VectorType::get(ConvertType(Info.ElementType), Info.EC); + // The `__mfp8` type maps to `<1 x i8>` which can't be used to build + // a vector type, hence bypass the call to `ConvertType` for + // the element type and create the vector type directly. + auto *EltTy = Info.ElementType->isMFloat8Type() + ? llvm::Type::getInt8Ty(getLLVMContext()) + : ConvertType(Info.ElementType); + auto *VTy = llvm::VectorType::get(EltTy, Info.EC); switch (Info.NumVectors) { default: llvm_unreachable("Expected 1, 2, 3 or 4 vectors!"); @@ -529,6 +532,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { } case BuiltinType::SveCount: return llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); + case BuiltinType::MFloat8: + return llvm::VectorType::get(llvm::Type::getInt8Ty(getLLVMContext()), 1, + false); #define PPC_VECTOR_TYPE(Name, Id, Size) \ case BuiltinType::Id: \ ResultType = \ @@ -650,9 +656,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { // An ext_vector_type of Bool is really a vector of bits. llvm::Type *IRElemTy = VT->isExtVectorBoolType() ? llvm::Type::getInt1Ty(getLLVMContext()) - : (VT->getElementType()->isMFloat8Type() - ? llvm::Type::getInt8Ty(getLLVMContext()) - : ConvertType(VT->getElementType())); + : VT->getElementType()->isMFloat8Type() + ? llvm::Type::getInt8Ty(getLLVMContext()) + : ConvertType(VT->getElementType()); ResultType = llvm::FixedVectorType::get(IRElemTy, VT->getNumElements()); break; } diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index c702e79ff8eb9..057199c66f5a1 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -244,6 +244,7 @@ AArch64ABIInfo::convertFixedToScalableVectorType(const VectorType *VT) const { case BuiltinType::SChar: case BuiltinType::UChar: + case BuiltinType::MFloat8: return llvm::ScalableVectorType::get( llvm::Type::getInt8Ty(getVMContext()), 16); @@ -776,8 +777,10 @@ bool AArch64ABIInfo::passAsPureScalableType( NPred += Info.NumVectors; else NVec += Info.NumVectors; - auto VTy = llvm::ScalableVectorType::get(CGT.ConvertType(Info.ElementType), - Info.EC.getKnownMinValue()); + llvm::Type *EltTy = Info.ElementType->isMFloat8Type() + ? 
llvm::Type::getInt8Ty(getVMContext()) + : CGT.ConvertType(Info.ElementType); + auto *VTy = llvm::ScalableVectorType::get(EltTy, Info.EC.getKnownMinValue()); if (CoerceToSeq.size() + Info.NumVectors > 12) return false; diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 0ecbf7cede1da..687d344163e20 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -449,7 +449,7 @@ std::string SVEType::builtinBaseType() const { case TypeKind::PredicatePattern: return "i"; case TypeKind::Fpm: - return "Wi"; + return "UWi"; case TypeKind::Predicate: return "b"; case TypeKind::BFloat16: @@ -457,7 +457,7 @@ return "y"; case TypeKind::MFloat8: assert(ElementBitwidth == 8 && "Invalid MFloat8!"); - return "c"; + return "m"; case TypeKind::Float: switch (ElementBitwidth) { case 16: From f1d5e70a00fbc80f42977800e9299353b06d48cb Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 27 Jan 2025 06:28:45 -0800 Subject: [PATCH 196/432] [SLP][NFC] Do not check poison values for corresponding vectorized entries No need to check whether poison values have been vectorized, or to mark them as vectorized; this should apply only to instructions. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index eea6b32460d70..640fcb56aab19 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3643,6 +3643,8 @@ class BoUpSLP { } if (!Last->isGather()) { for (Value *V : VL) { + if (isa<PoisonValue>(V)) + continue; const TreeEntry *TE = getTreeEntry(V); assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) && "Scalar already in tree!"); From eaa5897534cbd263d0cdbf780f72133c2fe8d8d4 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 27 Jan 2025 14:41:40 +0000 Subject: [PATCH 197/432] [libclc] Optimize CLC vector is(un)ordered builtins (#124546) These are similar to 347fb208, but these builtins are expressed in terms of other builtins. The LLVM IR generated features the same fcmp ord/uno comparisons as before, but consistently in vector form.
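The rewrite leans on the IEEE-754 identity that NaN is the only value which compares unequal to itself, so the ordered/unordered checks reduce to self-equality tests that lower to the same fcmp ord/uno per element. A minimal scalar sketch of that identity (illustrative only; the `my_*` names are hypothetical and this is not the actual libclc macro expansion):

/* Only NaN makes x == x false, so these match fcmp ord / fcmp uno. */
int my_isordered(float x, float y)   { return (x == x) && (y == y); }
int my_isunordered(float x, float y) { return !(x == x) || !(y == y); }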
--- .../clc/include/clc/relational/relational.h | 79 ------------------- .../lib/generic/relational/clc_isordered.cl | 22 +++--- .../lib/generic/relational/clc_isunordered.cl | 26 +++--- 3 files changed, 18 insertions(+), 109 deletions(-) diff --git a/libclc/clc/include/clc/relational/relational.h b/libclc/clc/include/clc/relational/relational.h index f32e7630203e4..f269715cfc83c 100644 --- a/libclc/clc/include/clc/relational/relational.h +++ b/libclc/clc/include/clc/relational/relational.h @@ -63,85 +63,6 @@ ARG_TYPE) \ _CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(RET_TYPE, FUNCTION, ARG_TYPE) -#define _CLC_DEFINE_RELATIONAL_BINARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_NAME, \ - ARG0_TYPE, ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return BUILTIN_NAME(x, y); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.lo, y.lo), \ - FUNCTION(x.hi, y.hi)} != (RET_TYPE)0); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.lo, y.lo), \ - FUNCTION(x.hi, y.hi)} != (RET_TYPE)0); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC3(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ - FUNCTION(x.s2, y.s2)} != (RET_TYPE)0); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC4(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ - FUNCTION(x.s2, y.s2), \ - FUNCTION(x.s3, y.s3)} != (RET_TYPE)0); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC8(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ - FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \ - FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), \ - FUNCTION(x.s6, y.s6), \ - FUNCTION(x.s7, y.s7)} != (RET_TYPE)0); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC16(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ - FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \ - FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), \ - FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7), \ - FUNCTION(x.s8, y.s8), FUNCTION(x.s9, y.s9), \ - FUNCTION(x.sa, y.sa), FUNCTION(x.sb, y.sb), \ - FUNCTION(x.sc, y.sc), FUNCTION(x.sd, y.sd), \ - FUNCTION(x.se, y.se), \ - FUNCTION(x.sf, y.sf)} != (RET_TYPE)0); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE##2, FUNCTION, ARG0_TYPE##2, \ - ARG1_TYPE##2) \ - _CLC_DEFINE_RELATIONAL_BINARY_VEC3(RET_TYPE##3, FUNCTION, ARG0_TYPE##3, \ - ARG1_TYPE##3) \ - _CLC_DEFINE_RELATIONAL_BINARY_VEC4(RET_TYPE##4, FUNCTION, ARG0_TYPE##4, \ - ARG1_TYPE##4) \ - _CLC_DEFINE_RELATIONAL_BINARY_VEC8(RET_TYPE##8, FUNCTION, ARG0_TYPE##8, \ - ARG1_TYPE##8) \ - _CLC_DEFINE_RELATIONAL_BINARY_VEC16(RET_TYPE##16, FUNCTION, ARG0_TYPE##16, \ - ARG1_TYPE##16) - -#define 
_CLC_DEFINE_RELATIONAL_BINARY(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, \ - ARG0_TYPE, ARG1_TYPE) \ - _CLC_DEFINE_RELATIONAL_BINARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, \ - ARG0_TYPE, ARG1_TYPE) \ - _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) - #define _CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(RET_TYPE, RET_TYPE_VEC, FUNCTION, \ ARG1_TYPE, ARG2_TYPE) \ _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ diff --git a/libclc/clc/lib/generic/relational/clc_isordered.cl b/libclc/clc/lib/generic/relational/clc_isordered.cl index 6183d1ddf918f..73cd96a0a56ed 100644 --- a/libclc/clc/lib/generic/relational/clc_isordered.cl +++ b/libclc/clc/lib/generic/relational/clc_isordered.cl @@ -2,33 +2,29 @@ #include #include -#define _CLC_DEFINE_ISORDERED(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ - return __clc_isequal(x, x) && __clc_isequal(y, y); \ - } +#define _CLC_RELATIONAL_OP(X, Y) \ + __clc_isequal((X), (X)) && __clc_isequal((Y), (Y)) -_CLC_DEFINE_ISORDERED(int, __clc_isordered, float, float) -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, __clc_isordered, float, float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isordered, float, float) #ifdef cl_khr_fp64 + #pragma OPENCL EXTENSION cl_khr_fp64 : enable // The scalar version of __clc_isordered(double, double) returns an int, but the // vector versions return long. - -_CLC_DEFINE_ISORDERED(int, __clc_isordered, double, double) -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isordered, double, double) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isordered, double, double) #endif + #ifdef cl_khr_fp16 + #pragma OPENCL EXTENSION cl_khr_fp16 : enable // The scalar version of __clc_isordered(half, half) returns an int, but the // vector versions return short. - -_CLC_DEFINE_ISORDERED(int, __clc_isordered, half, half) -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isordered, half, half) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isordered, half, half) #endif -#undef _CLC_DEFINE_ISORDERED +#undef _CLC_RELATIONAL_OP diff --git a/libclc/clc/lib/generic/relational/clc_isunordered.cl b/libclc/clc/lib/generic/relational/clc_isunordered.cl index dbbec031a65e5..fefda8e567517 100644 --- a/libclc/clc/lib/generic/relational/clc_isunordered.cl +++ b/libclc/clc/lib/generic/relational/clc_isunordered.cl @@ -1,12 +1,11 @@ #include +#include #include -// Note: It would be nice to use __builtin_isunordered with vector inputs, but -// it seems to only take scalar values as input, which will produce incorrect -// output for vector input types. +#define _CLC_RELATIONAL_OP(X, Y) \ + !__clc_isequal((X), (X)) || !__clc_isequal((Y), (Y)) -_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isunordered, __builtin_isunordered, - float, float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isunordered, float, float) #ifdef cl_khr_fp64 @@ -14,25 +13,18 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isunordered, __builtin_isunordered, // The scalar version of __clc_isunordered(double, double) returns an int, but // the vector versions return long. 
- -_CLC_DEF _CLC_OVERLOAD int __clc_isunordered(double x, double y) { - return __builtin_isunordered(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isunordered, double, double) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isunordered, double, double) #endif + #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable // The scalar version of __clc_isunordered(half, half) returns an int, but the // vector versions return short. - -_CLC_DEF _CLC_OVERLOAD int __clc_isunordered(half x, half y) { - return __builtin_isunordered(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isunordered, half, half) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isunordered, half, half) #endif + +#undef _CLC_RELATIONAL_OP From 561132e71b29d9b747dfda1509f715847852f77b Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Mon, 27 Jan 2025 15:50:51 +0100 Subject: [PATCH 198/432] [Clang] Fix immediate escalation of template function specializations. (#124404) We record whether an expression is immediate escalating in the FunctionScope. However, that only happens when parsing or transforming an expression. This might not happen when transforming a non-dependent expression. This patch fixes that by considering a function immediate when instantiated from an immediate function. Fixes #123405 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/AST/Decl.cpp | 4 ++++ .../test/SemaCXX/cxx2b-consteval-propagate.cpp | 18 ++++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 55a4a2e32383a..031c5d84e49f9 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1003,6 +1003,7 @@ Bug Fixes to C++ Support - Fixed assertions or false compiler diagnostics in the case of C++ modules for lambda functions or inline friend functions defined inside templates (#GH122493). - Clang now rejects declaring an alias template with the same name as its template parameter. (#GH123423) +- Fixed immediate escalation of non-dependent expressions. (#GH123405) - Fix type of expression when calling a template which returns an ``__array_rank`` querying a type depending on a template parameter. Now, such an expression can be used with ``static_assert`` and ``constexpr``. (#GH123498) - Correctly determine the implicit constexprness of lambdas in dependent contexts.
(#GH97958) (#GH114234) diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 74bcb618f2950..728556614e632 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3314,6 +3314,10 @@ bool FunctionDecl::isImmediateFunction() const { .getConstructor() ->isImmediateFunction(); + if (FunctionDecl *P = getTemplateInstantiationPattern(); + P && P->isImmediateFunction()) + return true; + if (const auto *MD = dyn_cast<CXXMethodDecl>(this); MD && MD->isLambdaStaticInvoker()) return MD->getParent()->getLambdaCallOperator()->isImmediateFunction(); diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp index 3f3123eaee76b..222d482f40aa5 100644 --- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp +++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp @@ -528,3 +528,21 @@ D d(0); // expected-note {{in implicit initialization for inherited constructor // expected-error@-1 {{call to immediate function 'GH112677::D::SimpleCtor' is not a constant expression}} } + +namespace GH123405 { + +consteval void fn() {} + +template <typename T> +constexpr int tfn(int) { + auto p = &fn; // expected-note {{'tfn<int>' is an immediate function because its body evaluates the address of a consteval function 'fn'}} + return int(p); // expected-error {{cast from pointer to smaller type 'int' loses information}} +} + +int g() { + int a; // expected-note {{declared here}} + return tfn<int>(a); // expected-error {{call to immediate function 'GH123405::tfn<int>' is not a constant expression}}\ + // expected-note {{read of non-const variable 'a' is not allowed in a constant expression}} +} + +} From 081723b9db84e78d7dd240b46af2aeb3b51b00be Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Mon, 27 Jan 2025 14:56:47 +0000 Subject: [PATCH 199/432] [lldb][TypeSystem] Ensure that ParmVarDecls have the correct DeclContext (#124279) While sifting through this part of the code I noticed that when we parse C++ methods, `DWARFASTParserClang` creates two sets of `ParmVarDecls`, once in `ParseChildParameters` and once in `AddMethodToCXXRecordType`. The former is unused when we're dealing with methods. Moreover, the `ParmVarDecls` we created in `ParseChildParameters` were created with an incorrect `clang::DeclContext` (namely the DeclContext of the function, and not the function itself). In Clang, there's `ParmVarDecl::setOwningFunction` to adjust the DeclContext of a parameter if the parameter was created before the FunctionDecl. But we never used it. This patch removes the `ParmVarDecl` creation from `ParseChildParameters` and instead adds a `TypeSystemClang::CreateParameterDeclarations` helper that ensures we set the DeclContext correctly. Note there is one difference in how `ParmVarDecl`s would be created now: we won't set a ClangASTMetadata entry for any of the parameters. I don't think this was ever actually useful for parameter DIEs anyway. This wasn't causing any concrete issues (that I know of), but was quite surprising. And this way of setting the parameters seems easier to reason about (in my opinion).
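To make the DeclContext issue concrete: a `clang::ParmVarDecl` is parented to whichever `DeclContext` it is created with, so a parameter created against the function's enclosing context and never re-parented ends up outside the function. A rough sketch of the correct ordering, assuming the `ParmVarDecl::Create` signature used elsewhere in this patch and hypothetical `Ctx`, `FuncDecl` and `ParamTy` variables:

// Create the parameter once the FunctionDecl exists, so its
// DeclContext is the function itself.
clang::ParmVarDecl *Param = clang::ParmVarDecl::Create(
    Ctx, FuncDecl, clang::SourceLocation(), clang::SourceLocation(),
    /*Id=*/nullptr, ParamTy, /*TInfo=*/nullptr, clang::SC_None,
    /*DefArg=*/nullptr);
assert(Param->getDeclContext() == FuncDecl);
// Had the parameter been created against another context first, it would
// need explicit re-parenting: Param->setOwningFunction(FuncDecl);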
--- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 36 ++-- .../SymbolFile/DWARF/DWARFASTParserClang.h | 12 +- .../SymbolFile/NativePDB/PdbAstBuilder.cpp | 2 +- .../Plugins/SymbolFile/PDB/PDBASTParser.cpp | 4 +- .../TypeSystem/Clang/TypeSystemClang.cpp | 50 +++--- .../TypeSystem/Clang/TypeSystemClang.h | 21 ++- lldb/unittests/Symbol/TestTypeSystemClang.cpp | 42 +++++ .../DWARF/DWARFASTParserClangTests.cpp | 170 ++++++++++++++++++ 8 files changed, 285 insertions(+), 52 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index e77188bfbd2e4..6602dd763ba69 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -1272,7 +1272,7 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, return_clang_type = m_ast.GetBasicType(eBasicTypeVoid); std::vector function_param_types; - std::vector function_param_decls; + llvm::SmallVector function_param_names; // Parse the function children for the parameters @@ -1284,7 +1284,7 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, if (die.HasChildren()) { ParseChildParameters(containing_decl_ctx, die, is_variadic, has_template_params, function_param_types, - function_param_decls); + function_param_names); } bool is_cxx_method = DeclKindIsCXXClass(containing_decl_ctx->getDeclKind()); @@ -1414,12 +1414,14 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, LinkDeclContextToDIE(function_decl, die); - if (!function_param_decls.empty()) { - m_ast.SetFunctionParameters(function_decl, function_param_decls); - if (template_function_decl) - m_ast.SetFunctionParameters(template_function_decl, - function_param_decls); - } + const clang::FunctionProtoType *function_prototype( + llvm::cast( + ClangUtil::GetQualType(clang_type).getTypePtr())); + const auto params = m_ast.CreateParameterDeclarations( + function_decl, *function_prototype, function_param_names); + function_decl->setParams(params); + if (template_function_decl) + template_function_decl->setParams(params); ClangASTMetadata metadata; metadata.SetUserID(die.GetID()); @@ -2380,7 +2382,7 @@ DWARFASTParserClang::ConstructDemangledNameFromDWARF(const DWARFDIE &die) { bool is_variadic = false; bool has_template_params = false; std::vector param_types; - std::vector param_decls; + llvm::SmallVector param_names; StreamString sstr; DWARFDeclContext decl_ctx = die.GetDWARFDeclContext(); @@ -2394,7 +2396,7 @@ DWARFASTParserClang::ConstructDemangledNameFromDWARF(const DWARFDIE &die) { die, GetCXXObjectParameter(die, *containing_decl_ctx)); ParseChildParameters(containing_decl_ctx, die, is_variadic, - has_template_params, param_types, param_decls); + has_template_params, param_types, param_names); sstr << "("; for (size_t i = 0; i < param_types.size(); i++) { if (i > 0) @@ -3157,7 +3159,7 @@ void DWARFASTParserClang::ParseChildParameters( clang::DeclContext *containing_decl_ctx, const DWARFDIE &parent_die, bool &is_variadic, bool &has_template_params, std::vector &function_param_types, - std::vector &function_param_decls) { + llvm::SmallVectorImpl &function_param_names) { if (!parent_die) return; @@ -3168,22 +3170,14 @@ void DWARFASTParserClang::ParseChildParameters( if (die.GetAttributeValueAsUnsigned(DW_AT_artificial, 0)) continue; - const char *name = die.GetName(); DWARFDIE param_type_die = die.GetAttributeValueAsReferenceDIE(DW_AT_type); Type *type = die.ResolveTypeUID(param_type_die); if (!type) break; + 
function_param_names.emplace_back(die.GetName()); function_param_types.push_back(type->GetForwardCompilerType()); - - clang::ParmVarDecl *param_var_decl = m_ast.CreateParameterDeclaration( - containing_decl_ctx, GetOwningClangModule(die), name, - type->GetForwardCompilerType(), clang::StorageClass::SC_None); - assert(param_var_decl); - function_param_decls.push_back(param_var_decl); - - m_ast.SetMetadataAsUserID(param_var_decl, die.GetID()); } break; case DW_TAG_unspecified_parameters: @@ -3205,6 +3199,8 @@ void DWARFASTParserClang::ParseChildParameters( break; } } + + assert(function_param_names.size() == function_param_names.size()); } clang::Decl *DWARFASTParserClang::GetClangDeclForDIE(const DWARFDIE &die) { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h index a5c3746ada4c3..d1eb2bcc2592e 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h @@ -186,12 +186,12 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { const lldb::AccessType default_accessibility, lldb_private::ClangASTImporter::LayoutInfo &layout_info); - void - ParseChildParameters(clang::DeclContext *containing_decl_ctx, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, - bool &is_variadic, bool &has_template_params, - std::vector &function_args, - std::vector &function_param_decls); + void ParseChildParameters( + clang::DeclContext *containing_decl_ctx, + const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + bool &is_variadic, bool &has_template_params, + std::vector &function_param_types, + llvm::SmallVectorImpl &function_param_names); size_t ParseChildEnumerators( const lldb_private::CompilerType &compiler_type, bool is_signed, diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp index 0c71df625ae34..5d4b22d08b111 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp @@ -1137,7 +1137,7 @@ void PdbAstBuilder::CreateFunctionParameters(PdbCompilandSymId func_id, } if (!params.empty() && params.size() == param_count) - m_clang.SetFunctionParameters(&function_decl, params); + function_decl.setParams(params); } clang::QualType PdbAstBuilder::CreateEnumType(PdbTypeSymId id, diff --git a/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp b/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp index fa3530a0c22ff..990bacd89bf34 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp +++ b/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp @@ -975,8 +975,8 @@ PDBASTParser::GetDeclForSymbol(const llvm::pdb::PDBSymbol &symbol) { } } } - if (params.size()) - m_ast.SetFunctionParameters(decl, params); + if (params.size() && decl) + decl->setParams(params); m_uid_to_decl[sym_id] = decl; diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 47051f2e68090..fc3dbfa311c9b 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -2217,12 +2217,6 @@ ParmVarDecl *TypeSystemClang::CreateParameterDeclaration( return decl; } -void TypeSystemClang::SetFunctionParameters( - FunctionDecl *function_decl, llvm::ArrayRef params) { - if (function_decl) - function_decl->setParams(params); -} - CompilerType 
TypeSystemClang::CreateBlockPointerType(const CompilerType &function_type) { QualType block_type = m_ast_up->getBlockPointerType( @@ -7708,6 +7702,32 @@ void TypeSystemClang::SetFloatingInitializerForVariable( ast, init_value, true, qt.getUnqualifiedType(), SourceLocation())); } +llvm::SmallVector +TypeSystemClang::CreateParameterDeclarations( + clang::FunctionDecl *func, const clang::FunctionProtoType &prototype, + const llvm::SmallVector ¶meter_names) { + assert(func); + assert(parameter_names.empty() || + parameter_names.size() == prototype.getNumParams()); + + llvm::SmallVector params; + for (unsigned param_index = 0; param_index < prototype.getNumParams(); + ++param_index) { + llvm::StringRef name = + !parameter_names.empty() ? parameter_names[param_index] : ""; + + auto *param = + CreateParameterDeclaration(func, /*owning_module=*/{}, name.data(), + GetType(prototype.getParamType(param_index)), + clang::SC_None, /*add_decl=*/false); + assert(param); + + params.push_back(param); + } + + return params; +} + clang::CXXMethodDecl *TypeSystemClang::AddMethodToCXXRecordType( lldb::opaque_compiler_type_t type, llvm::StringRef name, const char *mangled_name, const CompilerType &method_clang_type, @@ -7848,20 +7868,10 @@ clang::CXXMethodDecl *TypeSystemClang::AddMethodToCXXRecordType( getASTContext(), mangled_name, /*literal=*/false)); } - // Populate the method decl with parameter decls - - llvm::SmallVector params; - - for (unsigned param_index = 0; param_index < num_params; ++param_index) { - params.push_back(clang::ParmVarDecl::Create( - getASTContext(), cxx_method_decl, clang::SourceLocation(), - clang::SourceLocation(), - nullptr, // anonymous - method_function_prototype->getParamType(param_index), nullptr, - clang::SC_None, nullptr)); - } - - cxx_method_decl->setParams(llvm::ArrayRef(params)); + // Parameters on member function declarations in DWARF generally don't + // have names, so we omit them when creating the ParmVarDecls. + cxx_method_decl->setParams(CreateParameterDeclarations( + cxx_method_decl, *method_function_prototype, /*parameter_names=*/{})); AddAccessSpecifierDecl(cxx_record_decl, getASTContext(), GetCXXRecordDeclAccess(cxx_record_decl), diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index 678eaed381fd4..83f954270e309 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -489,9 +489,6 @@ class TypeSystemClang : public TypeSystem { const char *name, const CompilerType ¶m_type, int storage, bool add_decl = false); - void SetFunctionParameters(clang::FunctionDecl *function_decl, - llvm::ArrayRef params); - CompilerType CreateBlockPointerType(const CompilerType &function_type); // Array Types @@ -976,6 +973,24 @@ class TypeSystemClang : public TypeSystem { SetFloatingInitializerForVariable(clang::VarDecl *var, const llvm::APFloat &init_value); + /// For each parameter type of \c prototype, creates a \c clang::ParmVarDecl + /// whose \c clang::DeclContext is \c context. + /// + /// \param[in] context Non-null \c clang::FunctionDecl which will be the \c + /// clang::DeclContext of each parameter created/returned by this function. + /// \param[in] prototype The \c clang::FunctionProtoType of \c context. + /// \param[in] param_names The ith element of this vector contains the name + /// of the ith parameter. This parameter may be unnamed, in which case the + /// ith entry in \c param_names is an empty string. 
This vector is either + /// empty, or will have an entry for *each* parameter of the prototype + /// regardless of whether a parameter is unnamed or not. + /// + /// \returns A list of newly created of non-null \c clang::ParmVarDecl (one + /// for each parameter of \c prototype). + llvm::SmallVector CreateParameterDeclarations( + clang::FunctionDecl *context, const clang::FunctionProtoType &prototype, + const llvm::SmallVector ¶m_names); + clang::CXXMethodDecl *AddMethodToCXXRecordType( lldb::opaque_compiler_type_t type, llvm::StringRef name, const char *mangled_name, const CompilerType &method_type, diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp b/lldb/unittests/Symbol/TestTypeSystemClang.cpp index a2d1f6db80277..23374062127e0 100644 --- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp +++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp @@ -1040,3 +1040,45 @@ TEST_F(TestTypeSystemClang, GetDeclContextByNameWhenMissingSymbolFile) { EXPECT_TRUE(decls.empty()); } + +TEST_F(TestTypeSystemClang, AddMethodToCXXRecordType_ParmVarDecls) { + // Tests that AddMethodToCXXRecordType creates ParmVarDecl's with + // a correct clang::DeclContext. + + llvm::StringRef class_name = "S"; + CompilerType t = clang_utils::createRecord(*m_ast, class_name); + m_ast->StartTagDeclarationDefinition(t); + + CompilerType return_type = m_ast->GetBasicType(lldb::eBasicTypeVoid); + const bool is_virtual = false; + const bool is_static = false; + const bool is_inline = false; + const bool is_explicit = true; + const bool is_attr_used = false; + const bool is_artificial = false; + + llvm::SmallVector param_types{ + m_ast->GetBasicType(lldb::eBasicTypeInt), + m_ast->GetBasicType(lldb::eBasicTypeShort)}; + CompilerType function_type = m_ast->CreateFunctionType( + return_type, param_types.data(), /*num_params*/ param_types.size(), + /*variadic=*/false, /*quals*/ 0U); + m_ast->AddMethodToCXXRecordType( + t.GetOpaqueQualType(), "myFunc", nullptr, function_type, + lldb::AccessType::eAccessPublic, is_virtual, is_static, is_inline, + is_explicit, is_attr_used, is_artificial); + + // Complete the definition and check the created record. + m_ast->CompleteTagDeclarationDefinition(t); + + auto *record = llvm::cast(ClangUtil::GetAsTagDecl(t)); + + auto method_it = record->method_begin(); + ASSERT_NE(method_it, record->method_end()); + + EXPECT_EQ(method_it->getNumParams(), param_types.size()); + + // DeclContext of each parameter should be the CXXMethodDecl itself. + EXPECT_EQ(method_it->getParamDecl(0)->getDeclContext(), *method_it); + EXPECT_EQ(method_it->getParamDecl(1)->getDeclContext(), *method_it); +} diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp index 8adda6fba3a0b..6c77736113da3 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp @@ -1082,3 +1082,173 @@ TEST_F(DWARFASTParserClangTests, TestParseSubroutine_ExplicitObjectParameter) { clang::Qualifiers::fromCVRMask(clang::Qualifiers::Const | clang::Qualifiers::Volatile)); } + +TEST_F(DWARFASTParserClangTests, TestParseSubroutine_ParameterCreation) { + // Tests parsing of a C++ free function will create clang::ParmVarDecls with + // the correct clang::DeclContext. + // + // Also ensures we attach names to the ParmVarDecls (even when DWARF contains + // a mix of named/unnamed parameters). 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + - int + - short + - namedParam + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Code: 0x4 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x5 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x6 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") + - AbbrCode: 0x3 + Values: + - Value: 0x0 + - Value: 0x1 + - Value: 0x1 + +# DW_TAG_formal_parameter +# DW_AT_type [DW_FORM_ref4] (int) + - AbbrCode: 0x4 + Values: + - Value: 0x23 + +# DW_TAG_formal_parameter +# DW_AT_type [DW_FORM_ref4] (short) +# DW_AT_name [DW_FORM_strp] ("namedParam") + - AbbrCode: 0x5 + Values: + - Value: 0x2a + - Value: 0xf + + - AbbrCode: 0x0 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ("int") +# DW_AT_encoding [DW_FORM_data1] +# DW_AT_byte_size [DW_FORM_data1] + + - AbbrCode: 0x6 + Values: + - Value: 0x0000000000000005 + - Value: 0x0000000000000005 # DW_ATE_signed + - Value: 0x0000000000000004 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ("short") +# DW_AT_encoding [DW_FORM_data1] +# DW_AT_byte_size [DW_FORM_data1] + + - AbbrCode: 0x6 + Values: + - Value: 0x0000000000000009 + - Value: 0x0000000000000005 # DW_ATE_signed + - Value: 0x0000000000000004 + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto ts_or_err = + cu_die.GetDWARF()->GetTypeSystemForLanguage(eLanguageTypeC_plus_plus); + ASSERT_TRUE(static_cast(ts_or_err)); + llvm::consumeError(ts_or_err.takeError()); + + auto *ts = static_cast(ts_or_err->get()); + auto *parser = static_cast(ts->GetDWARFParser()); + + auto subprogram = cu_die.GetFirstChild(); + ASSERT_TRUE(subprogram.IsValid()); + ASSERT_EQ(subprogram.Tag(), DW_TAG_subprogram); + + SymbolContext sc; + bool new_type; + auto type_sp = parser->ParseTypeFromDWARF(sc, subprogram, &new_type); + ASSERT_NE(type_sp, nullptr); + + auto result = ts->GetTranslationUnitDecl()->lookup( + clang_utils::getDeclarationName(*ts, "func")); + ASSERT_TRUE(result.isSingleResult()); + + auto const *func = llvm::cast(result.front()); + + EXPECT_EQ(func->getNumParams(), 2U); + EXPECT_EQ(func->getParamDecl(0)->getDeclContext(), func); + EXPECT_TRUE(func->getParamDecl(0)->getName().empty()); + EXPECT_EQ(func->getParamDecl(1)->getDeclContext(), func); + EXPECT_EQ(func->getParamDecl(1)->getName(), "namedParam"); +} From 5c5bbffe75caaaefdc68305e85a625a057b09159 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Mon, 27 Jan 2025 14:57:09 +0000 Subject: [PATCH 200/432] [clang][ASTImporter] Import source location of explicit object parameter instead of copying it (#124305) We used to copy the `SourceLocation` instead of importing it, which isn't correct since the `SourceManager`'s of the source and target ASTContext might differ. Also adds test that confirms that we import the explicit object parameter location for `ParmVarDecl`s. This is how Clang determines whether a parameter `isExplicitObjectParamater`. The LLDB expression evaluator relies on this for calling "explicit object member functions". 
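For context, an explicit object member function is the C++23 "deducing this" form, where the object parameter is spelled out in the parameter list; Clang derives `ParmVarDecl::isExplicitObjectParameter()` from the validity of exactly the source location this patch now imports. A small self-contained illustration (hypothetical type, not taken from the patch):

// C++23: `self` is an explicit object parameter replacing the implicit `this`.
struct Counter {
  int n = 0;
  void bump(this Counter &self, int by) { self.n += by; }
};

int main() {
  Counter c;
  c.bump(3); // `c` binds to the explicit object parameter
  return c.n; // 3
}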
--- clang/lib/AST/ASTImporter.cpp | 8 ++++++-- clang/unittests/AST/ASTImporterTest.cpp | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 0669aa1b809c3..be1a65a49622d 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -4701,9 +4701,13 @@ ExpectedDecl ASTNodeImporter::VisitImplicitParamDecl(ImplicitParamDecl *D) { Error ASTNodeImporter::ImportDefaultArgOfParmVarDecl( const ParmVarDecl *FromParam, ParmVarDecl *ToParam) { + + if (auto LocOrErr = import(FromParam->getExplicitObjectParamThisLoc())) + ToParam->setExplicitObjectParameterLoc(*LocOrErr); + else + return LocOrErr.takeError(); + ToParam->setHasInheritedDefaultArg(FromParam->hasInheritedDefaultArg()); - ToParam->setExplicitObjectParameterLoc( - FromParam->getExplicitObjectParamThisLoc()); ToParam->setKNRPromoted(FromParam->isKNRPromoted()); if (FromParam->hasUninstantiatedDefaultArg()) { diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 791248e7a394f..e77860521335e 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -3441,6 +3441,7 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportParmVarDecl) { ASSERT_TRUE(FromVar); ASSERT_TRUE(FromVar->hasUninstantiatedDefaultArg()); ASSERT_TRUE(FromVar->getUninstantiatedDefaultArg()); + ASSERT_FALSE(FromVar->isExplicitObjectParameter()); const auto *ToVar = Import(FromVar, Lang_CXX11); EXPECT_TRUE(ToVar); @@ -3448,6 +3449,25 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportParmVarDecl) { EXPECT_TRUE(ToVar->getUninstantiatedDefaultArg()); EXPECT_NE(FromVar->getUninstantiatedDefaultArg(), ToVar->getUninstantiatedDefaultArg()); + EXPECT_FALSE(ToVar->isExplicitObjectParameter()); +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportParmVarDecl_Explicit) { + const auto *Code = R"( + struct Wrapper { + void func(this Wrapper) {} + }; + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX23); + auto *FromVar = FirstDeclMatcher<ParmVarDecl>().match(FromTU, parmVarDecl()); + ASSERT_TRUE(FromVar); + ASSERT_TRUE(FromVar->isExplicitObjectParameter()); + + const auto *ToVar = Import(FromVar, Lang_CXX23); + EXPECT_TRUE(ToVar); + EXPECT_TRUE(ToVar->isExplicitObjectParameter()); + EXPECT_NE(ToVar->getExplicitObjectParamThisLoc(), + FromVar->getExplicitObjectParamThisLoc()); } TEST_P(ASTImporterOptionSpecificTestBase, ImportOfNonEquivalentField) { From 1f5335c1db5d54b4465677c224b48e0ffc78e6d9 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar <1663364+MaheshRavishankar@users.noreply.github.com> Date: Mon, 27 Jan 2025 07:04:50 -0800 Subject: [PATCH 201/432] Make index computation use divsi/remsi (#124390) The index computation is meant to be signed. Using unsigned could lead to subtle errors. Fix places where some index math was using unsigned operations.
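For context, a hedged sketch of the signed delinearization this patch standardizes on; `rewriter`, `loc`, `linearIdx`, and `dimSize` are assumed to be in scope:

    // Index arithmetic in MLIR is signed, so splitting a linearized index
    // into (outer, inner) coordinates uses divsi/remsi; the unsigned forms
    // would compute wrong coordinates if a negative index ever reached them.
    Value inner = rewriter.create<arith::RemSIOp>(loc, linearIdx, dimSize); // idx % size
    Value outer = rewriter.create<arith::DivSIOp>(loc, linearIdx, dimSize); // idx / size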
Signed-off-by: MaheshRavishankar --- mlir/lib/Dialect/Arith/Utils/Utils.cpp | 2 +- .../Linalg/Transforms/ElementwiseOpFusion.cpp | 5 +- .../TosaToTensor/tosa-to-tensor.mlir | 24 +++---- .../Linalg/data-layout-propagation.mlir | 6 +- .../fuse-with-reshape-by-collapsing.mlir | 38 +++++------ .../Dialect/Linalg/fusion-push-reshape.mlir | 2 +- mlir/test/Dialect/Linalg/reshape_fusion.mlir | 66 +++++++++---------- mlir/test/Dialect/Tensor/bufferize.mlir | 4 +- 8 files changed, 74 insertions(+), 73 deletions(-) diff --git a/mlir/lib/Dialect/Arith/Utils/Utils.cpp b/mlir/lib/Dialect/Arith/Utils/Utils.cpp index 39c9005e449e3..8dde9866b22b3 100644 --- a/mlir/lib/Dialect/Arith/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Arith/Utils/Utils.cpp @@ -69,7 +69,7 @@ mlir::inferExpandShapeOutputShape(OpBuilder &b, Location loc, Value indexGroupSize = cast<Value>(inputShape[inputIndex]); Value indexGroupStaticSizesProduct = b.create<arith::ConstantIndexOp>(loc, indexGroupStaticSizesProductInt); - Value dynamicDimSize = b.createOrFold<arith::DivUIOp>( + Value dynamicDimSize = b.createOrFold<arith::DivSIOp>( loc, indexGroupSize, indexGroupStaticSizesProduct); outputShapeValues.push_back(dynamicDimSize); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp index 3a57f368d4425..60cae77644291 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -13,6 +13,7 @@ #include "mlir/Dialect/Linalg/Passes.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" @@ -1572,9 +1573,9 @@ void generateCollapsedIndexingRegion(Location loc, Block *block, rewriter.create<arith::ConstantIndexOp>(loc, foldedDims.index()); for (auto dim : llvm::reverse(foldedDimsRef.drop_front())) { indexReplacementVals[dim] = - rewriter.create<arith::RemUIOp>(loc, newIndexVal, loopRange[dim]); + rewriter.create<arith::RemSIOp>(loc, newIndexVal, loopRange[dim]); newIndexVal = - rewriter.create<arith::DivUIOp>(loc, newIndexVal, loopRange[dim]); + rewriter.create<arith::DivSIOp>(loc, newIndexVal, loopRange[dim]); } indexReplacementVals[foldedDims.value().front()] = newIndexVal; } diff --git a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir index 2f11b31aad230..27018fb79f60d 100644 --- a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir +++ b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir @@ -86,7 +86,7 @@ func.func @test_reshape_1d_down_s2s_explicit(%arg0: tensor<1xf32>) -> tensor // CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[VAL_0:.*]] = arith.divui %[[DIM]], %[[C2]] : index +// CHECK: %[[VAL_0:.*]] = arith.divsi %[[DIM]], %[[C2]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[ARG_0]] {{\[\[}}0, 1]] output_shape [2, %[[VAL_0]]] : tensor into tensor<2x?xf32> // CHECK: return %[[EXPANDED]] : tensor<2x?xf32> func.func @test_reshape_1d_up_d2d_auto(%arg0: tensor) -> tensor<2x?xf32> { @@ -135,7 +135,7 @@ func.func @test_reshape_2d_down_s2s_explicit(%arg0: tensor<2x3xf32>) -> tensor<6 // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C2]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C2]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1]] output_shape [2, %[[DIV]]] : tensor
into tensor<2x?xf32> // CHECK: return %[[EXPANDED]] : tensor<2x?xf32> func.func @test_reshape_2d_same_d2d_auto(%arg0: tensor) -> tensor<2x?xf32> { @@ -189,7 +189,7 @@ func.func @test_reshape_2d_same_s2s_explicit(%arg0: tensor<3x2xf32>) -> tensor<2 // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C0_0]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C0_0]] : index // CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [0, 3, %[[DIV]]] : tensor into tensor<0x3x?xf32> // CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<0x3x?xf32> to tensor // CHECK: return %[[VAL_2]] : tensor @@ -206,7 +206,7 @@ func.func @test_reshape_3d_same_d2d_auto_empty(%arg0: tensor<3x2x?xf32>) -> tens // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C8:.*]] = arith.constant 8 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C8]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C8]] : index // CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [2, %[[DIV]], 4] : tensor into tensor<2x?x4xf32> // CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<2x?x4xf32> to tensor // CHECK: return %[[VAL_2]] : tensor @@ -223,7 +223,7 @@ func.func @test_reshape_3d_same_d2d_auto(%arg0: tensor<2x?x?xf32>) -> tensor // CHECK: %[[C6:.*]] = arith.constant 6 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C6]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C6]] : index // CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [2, 3, %[[DIV]]] : tensor into tensor<2x3x?xf32> // CHECK: return %[[VAL_1]] : tensor<2x3x?xf32> func.func @test_reshape_3d_same_d2d_auto_identity(%arg0: tensor) -> tensor<2x3x?xf32> { @@ -239,7 +239,7 @@ func.func @test_reshape_3d_same_d2d_auto_identity(%arg0: tensor) -> t // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C6:.*]] = arith.constant 6 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C6]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C6]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [%[[DIV]], 3, 2] : tensor into tensor // CHECK: %[[VAL_2:.*]] = tensor.cast %[[EXPANDED]] : tensor to tensor // CHECK: return %[[VAL_2]] : tensor @@ -256,7 +256,7 @@ func.func @test_reshape_3d_same_d2d_explicit_empty(%arg0: tensor<3x2x?xf32>) -> // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C12]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C12]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [%[[DIV]], 3, 4] : tensor into tensor // CHECK: %[[VAL_2:.*]] = tensor.cast %[[EXPANDED]] : tensor to tensor // CHECK: return %[[VAL_2]] : tensor @@ -284,7 +284,7 @@ func.func @test_reshape_3d_same_d2d_explicit_identity(%arg0: tensor) // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C8:.*]] = arith.constant 8 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C8]] : index +// CHECK: %[[DIV:.*]] = 
arith.divsi %[[DIM]], %[[C8]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [2, %[[DIV]], 4] : tensor into tensor<2x?x4xf32> // CHECK: %[[VAL_2:.*]] = tensor.cast %[[EXPANDED]] : tensor<2x?x4xf32> to tensor<2x3x4xf32> // CHECK: return %[[VAL_2]] : tensor<2x3x4xf32> @@ -301,7 +301,7 @@ func.func @test_reshape_3d_same_d2s_auto(%arg0: tensor) -> tensor<2x3 // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C12]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C12]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [%[[DIV]], 3, 4] : tensor into tensor // CHECK: %[[VAL_2:.*]] = tensor.cast %[[EXPANDED]] : tensor to tensor<2x3x4xf32> // CHECK: return %[[VAL_2]] : tensor<2x3x4xf32> @@ -328,7 +328,7 @@ func.func @test_reshape_3d_same_s2s_explicit_identity(%arg0: tensor<2x3x4xf32>) // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[COLLAPSED]], %[[C0]] : tensor // CHECK: %[[C6:.*]] = arith.constant 6 : index -// CHECK: %[[VAL_0:.*]] = arith.divui %[[DIM]], %[[C6]] : index +// CHECK: %[[VAL_0:.*]] = arith.divsi %[[DIM]], %[[C6]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[COLLAPSED]] {{\[\[}}0, 1, 2, 3]] output_shape [%[[VAL_0]], 3, 2, 1] : tensor into tensor // CHECK: %[[CAST:.*]] = tensor.cast %[[EXPANDED]] : tensor to tensor<1x3x2x1xf32> // CHECK: return %[[CAST]] : tensor<1x3x2x1xf32> @@ -357,7 +357,7 @@ func.func @test_reshape_4d_down_d2s_explicit(%arg0: tensor) -> tens // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[COLLAPSED]], %[[C0]] : tensor // CHECK: %[[C6:.*]] = arith.constant 6 : index -// CHECK: %[[VAL_0:.*]] = arith.divui %[[DIM]], %[[C6]] : index +// CHECK: %[[VAL_0:.*]] = arith.divsi %[[DIM]], %[[C6]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[COLLAPSED]] {{\[\[}}0, 1, 2]] output_shape [%[[VAL_0]], 2, 3] : tensor into tensor // CHECK: return %[[EXPANDED]] : tensor func.func @test_reshape_5d_down_d2d_auto(%arg0: tensor) -> tensor { @@ -373,7 +373,7 @@ func.func @test_reshape_5d_down_d2d_auto(%arg0: tensor) -> tensor // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[COLLAPSED]], %[[C0]] : tensor // CHECK: %[[C385:.*]] = arith.constant 385 : index -// CHECK: %[[VAL_0:.*]] = arith.divui %[[DIM]], %[[C385]] : index +// CHECK: %[[VAL_0:.*]] = arith.divsi %[[DIM]], %[[C385]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[COLLAPSED]] {{\[\[}}0, 1, 2]] output_shape [%[[VAL_0]], 5, 77] : tensor into tensor // CHECK: return %[[EXPANDED]] : tensor func.func @test_reshape_6d_down_d2d_auto(%arg0: tensor<1x2x?x5x7x11xf32>) -> tensor { diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir index 07708231a6e2f..cb8064411bbae 100644 --- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir +++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir @@ -1301,7 +1301,7 @@ func.func @push_down_unpack_through_expand(%5: tensor, %dim: index // CHECK: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor -// CHECK: %[[SZ0:.+]] = arith.divui %[[DIM0]], %[[C32]] : index +// CHECK: %[[SZ0:.+]] = arith.divsi %[[DIM0]], 
%[[C32]] : index // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3], [4]] output_shape [%[[SZ0]], 32, 32, 8, 8] : tensor into tensor // CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor @@ -1322,7 +1322,7 @@ func.func @push_down_unpack_through_expand_empty_outer_dims_perm(%5: tensor -// CHECK: %[[SZ0:.+]] = arith.divui %[[DIM0]], %[[C32]] : index +// CHECK: %[[SZ0:.+]] = arith.divsi %[[DIM0]], %[[C32]] : index // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3], [4]] output_shape [%[[SZ0]], 32, 32, 8, 8] : tensor into tensor // CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor @@ -1373,7 +1373,7 @@ func.func @push_down_unpack_through_expand_on_outer_dims(%5: tensor, // CHECK: %[[C256:.+]] = arith.constant 256 : index // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor -// CHECK: %[[SZ0:.+]] = arith.divui %[[DIM0]], %[[C256]] : index +// CHECK: %[[SZ0:.+]] = arith.divsi %[[DIM0]], %[[C256]] : index // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3]] output_shape [%[[SZ0]], 256, 32, 8] : tensor into tensor // CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor diff --git a/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir b/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir index f17881d59a266..7db997cd4c0b5 100644 --- a/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir +++ b/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir @@ -99,14 +99,14 @@ func.func @fuse_by_collapsing_indexing_op(%arg0 : tensor<2x12x5x336x9xi32>, // CHECK-DAG: %[[C7:.+]] = arith.constant 7 : index // CHECK: %[[IV0:.+]] = linalg.index 0 // CHECK: %[[IV1:.+]] = linalg.index 1 -// CHECK: %[[REM_IV1:.+]] = arith.remui %[[IV1]], %[[C4]] -// CHECK: %[[DIV_IV1:.+]] = arith.divui %[[IV1]], %[[C4]] +// CHECK: %[[REM_IV1:.+]] = arith.remsi %[[IV1]], %[[C4]] +// CHECK: %[[DIV_IV1:.+]] = arith.divsi %[[IV1]], %[[C4]] // CHECK: %[[IV2:.+]] = linalg.index 2 // CHECK: %[[IV3:.+]] = linalg.index 3 -// CHECK: %[[REM1_IV3:.+]] = arith.remui %[[IV3]], %[[C8]] -// CHECK: %[[DIV1_IV3:.+]] = arith.divui %[[IV3]], %[[C8]] -// CHECK: %[[REM2_IV3:.+]] = arith.remui %[[DIV1_IV3]], %[[C7]] -// CHECK: %[[DIV2_IV3:.+]] = arith.divui %[[DIV1_IV3]], %[[C7]] +// CHECK: %[[REM1_IV3:.+]] = arith.remsi %[[IV3]], %[[C8]] +// CHECK: %[[DIV1_IV3:.+]] = arith.divsi %[[IV3]], %[[C8]] +// CHECK: %[[REM2_IV3:.+]] = arith.remsi %[[DIV1_IV3]], %[[C7]] +// CHECK: %[[DIV2_IV3:.+]] = arith.divsi %[[DIV1_IV3]], %[[C7]] // CHECK: %[[IV4:.+]] = linalg.index 4 // CHECK: %[[T0:.+]] = arith.addi %[[IV0]], %[[DIV_IV1]] // CHECK: %[[T1:.+]] = arith.addi %[[T0]], %[[REM_IV1]] @@ -215,13 +215,13 @@ func.func @fuse_by_collapsing_dynamic(%arg0 : tensor, // CHECK-DAG: %[[D1:.+]] = tensor.dim %[[EXPAND]], %[[C5]] // CHECK: linalg.generic // CHECK: %[[IV0:.+]] = linalg.index 1 -// CHECK: %[[REM1_IV0:.+]] = arith.remui %[[IV0]], %[[C5]] -// CHECK: %[[DIV1_IV0:.+]] = arith.divui %[[IV0]], %[[C5]] -// CHECK: %[[REM2_IV0:.+]] = arith.remui %[[DIV1_IV0]], %[[D1]] -// CHECK: %[[DIV2_IV0:.+]] = arith.divui %[[DIV1_IV0]], %[[D1]] +// CHECK: %[[REM1_IV0:.+]] = arith.remsi %[[IV0]], %[[C5]] +// CHECK: %[[DIV1_IV0:.+]] = arith.divsi %[[IV0]], %[[C5]] +// CHECK: %[[REM2_IV0:.+]] = 
arith.remsi %[[DIV1_IV0]], %[[D1]] +// CHECK: %[[DIV2_IV0:.+]] = arith.divsi %[[DIV1_IV0]], %[[D1]] // CHECK: %[[IV1:.+]] = linalg.index 3 -// CHECK: %[[REM1_IV1:.+]] = arith.remui %[[IV1]], %[[D0]] -// CHECK: %[[DIV1_IV1:.+]] = arith.divui %[[IV1]], %[[D0]] +// CHECK: %[[REM1_IV1:.+]] = arith.remsi %[[IV1]], %[[D0]] +// CHECK: %[[DIV1_IV1:.+]] = arith.divsi %[[IV1]], %[[D0]] // ----- @@ -439,7 +439,7 @@ func.func @fuse_only_one_reassociation(%arg0 : tensor, %arg1 : tensor<4 // CHECK-SAME: outs(%[[COLLAPSE_ARG1_1]] : // CHECK: %[[DIM:.+]] = tensor.dim %[[GENERIC]], %[[C1]] : tensor<4x?x?xf32> // CHECK: %[[DIM_2:.+]] = tensor.dim %[[GENERIC]], %[[C2]] : tensor<4x?x?xf32> -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_2]], %[[C8]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_2]], %[[C8]] : index // CHECK: %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0], [1], [2, 3]] output_shape [4, %[[DIM]], %[[VAL_1]], 8] : tensor<4x?x?xf32> into tensor<4x?x?x8xf32> // CHECK: return %[[EXPANDED_3]] @@ -492,11 +492,11 @@ func.func @fold_non_consecutive_dims(%arg0 : tensor, %sz0: index, %sz1: // CHECK-SAME: outs(%[[COLLAPSE_INIT]] : // CHECK-NEXT: ^bb{{[0-9]}} // CHECK: %[[ID0:.+]] = linalg.index 0 -// CHECK-DAG: %[[T0:.+]] = arith.remui %[[ID0]], %[[C4]] -// CHECK-DAG: %[[T1:.+]] = arith.divui %[[ID0]], %[[C4]] +// CHECK-DAG: %[[T0:.+]] = arith.remsi %[[ID0]], %[[C4]] +// CHECK-DAG: %[[T1:.+]] = arith.divsi %[[ID0]], %[[C4]] // CHECK: %[[ID1:.+]] = linalg.index 1 -// CHECK-DAG: %[[T2:.+]] = arith.remui %[[ID1]], %[[C8]] -// CHECK-DAG: %[[T3:.+]] = arith.divui %[[ID1]], %[[C8]] +// CHECK-DAG: %[[T2:.+]] = arith.remsi %[[ID1]], %[[C8]] +// CHECK-DAG: %[[T3:.+]] = arith.divsi %[[ID1]], %[[C8]] // CHECK-DAG: %[[T4:.+]] = arith.addi %[[T1]], %[[T2]] // CHECK-DAG: %[[T5:.+]] = arith.addi %[[T4]], %[[T0]] // CHECK-DAG: %[[T6:.+]] = arith.addi %[[T5]], %[[T3]] @@ -504,8 +504,8 @@ func.func @fold_non_consecutive_dims(%arg0 : tensor, %sz0: index, %sz1: // CHECK: linalg.yield %[[T7]] // CHECK: %[[DIM_1:.+]] = tensor.dim %[[GENERIC]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[GENERIC]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_1]], %[[C8]] : index -// CHECK: %[[VAL_3:.+]] = arith.divui %[[DIM_2]], %[[C4]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_1]], %[[C8]] : index +// CHECK: %[[VAL_3:.+]] = arith.divsi %[[DIM_2]], %[[C4]] : index // CHECK: %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_2]], 8, %[[VAL_3]], 4] : tensor into tensor // CHECK: return %[[EXPANDED_3]] diff --git a/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir b/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir index 751ece37bc094..7acbd843cd1e7 100644 --- a/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir +++ b/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir @@ -12,7 +12,7 @@ // CHECK-SAME: iterator_types = ["parallel", "parallel"]} // CHECK-SAME: ins(%[[A]], %[[B]] : tensor, tensor<16xf32>) outs(%[[RI]] : tensor) // CHECK: %[[DIM:.*]] = tensor.dim %[[R]], %[[C0]] : tensor -// CHECK: %[[VAL_1:.*]] = arith.divui %[[DIM]], %[[C112]] : index +// CHECK: %[[VAL_1:.*]] = arith.divsi %[[DIM]], %[[C112]] : index // CHECK: %[[RR:.*]] = tensor.expand_shape %[[R]] {{\[\[}}0, 1], [2]] output_shape [%[[VAL_1]], 112, 16] : tensor into tensor // CHECK: return %[[RR]] : tensor func.func @reshape(%A: tensor, %B: tensor<16xf32>, %init: tensor, %sz0: index) -> tensor { diff --git a/mlir/test/Dialect/Linalg/reshape_fusion.mlir 
b/mlir/test/Dialect/Linalg/reshape_fusion.mlir index b8df5fc88e199..ef853e4d662a7 100644 --- a/mlir/test/Dialect/Linalg/reshape_fusion.mlir +++ b/mlir/test/Dialect/Linalg/reshape_fusion.mlir @@ -37,12 +37,12 @@ func.func @generic_op_reshape_producer_fusion(%arg0 : tensor, // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG1]], %[[C2]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM_1]], %[[C4]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM_1]], %[[C4]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0], [1], [2, 3]] output_shape [%[[DIM]], %[[DIM_0]], %[[VAL_0]], 4] : tensor into tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_3:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor // CHECK: %[[DIM_4:.+]] = tensor.dim %[[ARG1]], %[[C2]] : tensor -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_4]], %[[C4]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_4]], %[[C4]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0], [1], [2, 3]] output_shape [%[[DIM_2]], %[[DIM_3]], %[[VAL_1]], 4] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP5]], #[[MAP6]], #[[MAP7]], #[[MAP6]]] @@ -93,15 +93,15 @@ func.func @generic_op_reshape_consumer_fusion(%arg0 : tensor, // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM_0]], %[[C20]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM_0]], %[[C20]] : index // CHECK: %[[T0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM]], 4, %[[VAL_0]], 5] : tensor into tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_2]], %[[C20]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_2]], %[[C20]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_1]], 4, %[[VAL_1]], 5] : tensor into tensor // CHECK: %[[DIM_4:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_5:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_5]], %[[C20]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_5]], %[[C20]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_4]], 4, %[[VAL_2]], 5] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP2]], #[[MAP2]], #[[MAP3]], #[[MAP2]]] @@ -144,18 +144,18 @@ func.func @reshape_as_consumer_permutation // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C12]] : index -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_0]], %[[C2]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM]], %[[C12]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_0]], %[[C2]] : index // CHECK: %[[T0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1, 2], [3, 4], [5]] output_shape [3, 4, %[[VAL_0]], %[[VAL_1]], 2, %[[DIM_1]]] : tensor into tensor<3x4x?x?x2x?xf32> // CHECK: %[[DIM_2:.+]] = tensor.dim 
%[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_3:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_2]], %[[C12]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_2]], %[[C12]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [3, 4, %[[VAL_2]], %[[DIM_3]]] : tensor into tensor<3x4x?x?xf32> // CHECK: %[[DIM_5:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_6:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[DIM_7:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor -// CHECK: %[[VAL_3:.+]] = arith.divui %[[DIM_5]], %[[C2]] : index -// CHECK: %[[VAL_4:.+]] = arith.divui %[[DIM_7]], %[[C12]] : index +// CHECK: %[[VAL_3:.+]] = arith.divsi %[[DIM_5]], %[[C2]] : index +// CHECK: %[[VAL_4:.+]] = arith.divsi %[[DIM_7]], %[[C12]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2], [3, 4, 5]] output_shape [%[[VAL_3]], 2, %[[DIM_6]], 3, 4, %[[VAL_4]]] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP8]], #[[MAP9]], #[[MAP10]]] @@ -463,15 +463,15 @@ func.func @generic_op_reshape_consumer_fusion_projected(%arg0 : tensor, // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C20]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM]], %[[C20]] : index // CHECK: %[[T0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1, 2], [3]] output_shape [%[[VAL_0]], 4, 5, %[[DIM_0]]] : tensor into tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_1]], %[[C20]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_1]], %[[C20]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [%[[VAL_1]], 4, 5, %[[DIM_2]]] : tensor into tensor // CHECK: %[[DIM_4:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_5:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_5]], %[[C20]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_5]], %[[C20]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_4]], %[[VAL_2]], 4, 5] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP4]], #[[MAP4]], #[[MAP5]]] @@ -569,24 +569,24 @@ func.func @reshape_as_consumer_permutation_with_multiple_results // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C12]] : index -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_0]], %[[C2]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM]], %[[C12]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_0]], %[[C2]] : index // CHECK: %[[RESHAPE0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1, 2], [3, 4], [5]] output_shape [3, 4, %[[VAL_0]], %[[VAL_1]], 2, %[[DIM_1]]] : tensor into tensor<3x4x?x?x2x?xf32> // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_3:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_2]], %[[C12]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi 
%[[DIM_2]], %[[C12]] : index // CHECK: %[[RESHAPE1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [3, 4, %[[VAL_2]], %[[DIM_3]]] : tensor into tensor<3x4x?x?xf32> // CHECK: %[[DIM_5:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_6:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[DIM_7:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor -// CHECK: %[[VAL_3:.+]] = arith.divui %[[DIM_5]], %[[C2]] : index -// CHECK: %[[VAL_4:.+]] = arith.divui %[[DIM_7]], %[[C12]] : index +// CHECK: %[[VAL_3:.+]] = arith.divsi %[[DIM_5]], %[[C2]] : index +// CHECK: %[[VAL_4:.+]] = arith.divsi %[[DIM_7]], %[[C12]] : index // CHECK: %[[RESHAPE2:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2], [3, 4, 5]] output_shape [%[[VAL_3]], 2, %[[DIM_6]], 3, 4, %[[VAL_4]]] : tensor into tensor // CHECK: %[[DIM_9:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_10:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[DIM_11:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor -// CHECK: %[[VAL_5:.+]] = arith.divui %[[DIM_10]], %[[C2]] : index -// CHECK: %[[VAL_6:.+]] = arith.divui %[[DIM_11]], %[[C12]] : index +// CHECK: %[[VAL_5:.+]] = arith.divsi %[[DIM_10]], %[[C2]] : index +// CHECK: %[[VAL_6:.+]] = arith.divsi %[[DIM_11]], %[[C12]] : index // CHECK: %[[RESHAPE3:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2], [3, 4, 5]] output_shape [%[[DIM_9]], %[[VAL_5]], 2, 3, 4, %[[VAL_6]]] : tensor into tensor // CHECK: %[[GENERIC:.+]]:2 = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]], #[[MAP3]]] @@ -667,11 +667,11 @@ func.func @generic_op_reshape_consumer_fusion_reduction(%arg0 : tensor, // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C20]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM]], %[[C20]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [%[[VAL_0]], 4, 5, %[[DIM_0]]] : tensor into tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG2]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG2]], %[[C1]] : tensor -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_2]], %[[C20]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_2]], %[[C20]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG2]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_1]], %[[VAL_1]], 4, 5] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]] @@ -719,13 +719,13 @@ func.func @generic_op_reshape_producer_fusion_with_reduction(%arg0 : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG1]], %[[C2]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C8]] : index -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_0]], %[[C7]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM]], %[[C8]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_0]], %[[C7]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1], [2], [3, 4]] output_shape [%[[VAL_0]], 8, 4, %[[VAL_1]], 7] : tensor into tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG2]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG2]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_1]], %[[C8]] : index -// CHECK: %[[VAL_3:.+]] = arith.divui %[[DIM_2]], %[[C7]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_1]], %[[C8]] : index +// 
CHECK: %[[VAL_3:.+]] = arith.divsi %[[DIM_2]], %[[C7]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG2]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_2]], 8, %[[VAL_3]], 7] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]] @@ -764,15 +764,15 @@ func.func @linalg_add_reshape_consumer_fusion(%arg0 : tensor, // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM_0]], %[[C20]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM_0]], %[[C20]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM]], %[[VAL_0]], 4, 5] : tensor into tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_2]], %[[C20]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_2]], %[[C20]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_1]], %[[VAL_1]], 4, 5] : tensor into tensor // CHECK: %[[DIM_4:.+]] = tensor.dim %[[ARG2]], %[[C0]] : tensor // CHECK: %[[DIM_5:.+]] = tensor.dim %[[ARG2]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_5]], %[[C20]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_5]], %[[C20]] : index // CHECK: %[[T3:.+]] = tensor.expand_shape %[[ARG2]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_4]], %[[VAL_2]], 4, 5] : tensor into tensor // CHECK: %[[T4:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]], #[[MAP]]] @@ -809,13 +809,13 @@ func.func @linalg_add_reshape_producer_fusion(%arg0 : tensor, // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C7]] : index -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_0]], %[[C8]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM]], %[[C7]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_0]], %[[C8]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_0]], 7, %[[VAL_1]], 8] : tensor into tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG2]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG2]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_1]], %[[C7]] : index -// CHECK: %[[VAL_3:.+]] = arith.divui %[[DIM_2]], %[[C8]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_1]], %[[C7]] : index +// CHECK: %[[VAL_3:.+]] = arith.divsi %[[DIM_2]], %[[C8]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG2]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_2]], 7, %[[VAL_3]], 8] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]], #[[$MAP]]] diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir index ecd285be46194..9ea0a15f31185 100644 --- a/mlir/test/Dialect/Tensor/bufferize.mlir +++ b/mlir/test/Dialect/Tensor/bufferize.mlir @@ -372,7 +372,7 @@ func.func @tensor.expand_shape(%t1: tensor, %sz0: index) -> tensor<2x? 
// CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = memref.dim %[[m1]], %[[C0]] : memref // CHECK: %[[C2:.*]] = arith.constant 2 : index - // CHECK: %[[VAL_1:.*]] = arith.divui %[[DIM]], %[[C2]] : index + // CHECK: %[[VAL_1:.*]] = arith.divsi %[[DIM]], %[[C2]] : index // CHECK: %[[expanded:.*]] = memref.expand_shape %[[m1]] {{\[\[}}0, 1], [2]] output_shape [2, %[[VAL_1]], 10] : memref into memref<2x?x10xf32> %0 = tensor.expand_shape %t1 [[0, 1], [2]] output_shape [2, %sz0, 10] : tensor into tensor<2x?x10xf32> @@ -393,7 +393,7 @@ func.func @tensor.expand_shape_of_slice( %0 = tensor.extract_slice %t1[%o1, 5][%s1, 10][1, 1] : tensor to tensor // CHECK: %[[C7:.*]] = arith.constant 7 : index - // CHECK: %[[VAL_1:.*]] = arith.divui %{{.*}}, %[[C7]] : index + // CHECK: %[[VAL_1:.*]] = arith.divsi %{{.*}}, %[[C7]] : index // CHECK: %[[expanded:.*]] = memref.expand_shape %[[subview]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_1]], 7, 2, 5] : memref> into memref> %1 = tensor.expand_shape %0 [[0, 1], [2, 3]] output_shape [%sz0, 7, 2, 5] : tensor into tensor From 27c917307563eae93c7fef9c3944e56e1f5b5f6d Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Mon, 27 Jan 2025 23:04:57 +0800 Subject: [PATCH 202/432] [Clang] Remove unnecessary Decl transform & profiles for SizeOfPackExpr (#124533) We used to always transform the pattern declaration for SizeOfPackExpr to ensure the constraint expression's profile produced the desired result. However, this approach failed to handle pack expansions when the pack referred to function parameters. In such cases, the function parameters were formerly expanded to 1 to avoid building Subst* nodes (see e6974daa7). That workaround caused us to transform a pack without a proper ArgumentPackSubstitutionIndex, leading to crashes when transforming the pattern. It turns out that profiling the pattern for partially substituted SizeOfPackExprs is unnecessary because their transformed forms are also profiled within the partial arguments. Fixes https://github.com/llvm/llvm-project/issues/124161 --- clang/include/clang/AST/ExprCXX.h | 2 -- clang/lib/AST/StmtProfile.cpp | 2 +- clang/lib/Sema/SemaTemplateInstantiate.cpp | 26 ----------------- .../SemaTemplate/concepts-out-of-line-def.cpp | 28 +++++++++++++++++++ 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 2a130bc6da79a..7b0450b90d564 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -4326,8 +4326,6 @@ class SizeOfPackExpr final /// Retrieve the parameter pack. NamedDecl *getPack() const { return Pack; } - void setPack(NamedDecl *NewPack) { Pack = NewPack; } - /// Retrieve the length of the parameter pack. 
/// /// This routine may only be invoked when the expression is not diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 85b59f714ba84..5d1f370cac19f 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2270,13 +2270,13 @@ void StmtProfiler::VisitPackExpansionExpr(const PackExpansionExpr *S) { void StmtProfiler::VisitSizeOfPackExpr(const SizeOfPackExpr *S) { VisitExpr(S); - VisitDecl(S->getPack()); if (S->isPartiallySubstituted()) { auto Args = S->getPartialArguments(); ID.AddInteger(Args.size()); for (const auto &TA : Args) VisitTemplateArgument(TA); } else { + VisitDecl(S->getPack()); ID.AddInteger(0); } } diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 3dc5696bd3821..3c6b7ce2949c1 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1762,23 +1762,6 @@ namespace { return inherited::TransformLambdaBody(E, Body); } - ExprResult TransformSizeOfPackExpr(SizeOfPackExpr *E) { - ExprResult Transformed = inherited::TransformSizeOfPackExpr(E); - if (!Transformed.isUsable()) - return Transformed; - auto *TransformedExpr = cast<SizeOfPackExpr>(Transformed.get()); - if (SemaRef.CodeSynthesisContexts.back().Kind == - Sema::CodeSynthesisContext::ConstraintNormalization && - TransformedExpr->getPack() == E->getPack()) { - Decl *NewPack = - TransformDecl(E->getPackLoc(), TransformedExpr->getPack()); - if (!NewPack) - return ExprError(); - TransformedExpr->setPack(cast<NamedDecl>(NewPack)); - } - return TransformedExpr; - } - ExprResult TransformRequiresExpr(RequiresExpr *E) { LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); ExprResult TransReq = inherited::TransformRequiresExpr(E); @@ -1902,15 +1885,6 @@ Decl *TemplateInstantiator::TransformDecl(SourceLocation Loc, Decl *D) { TemplateArgument Arg = TemplateArgs(TTP->getDepth(), TTP->getPosition()); if (TTP->isParameterPack()) { - // We might not have an index for pack expansion when normalizing - // constraint expressions. In that case, resort to instantiation scopes - // for the transformed declarations. - if (SemaRef.ArgumentPackSubstitutionIndex == -1 && - SemaRef.CodeSynthesisContexts.back().Kind == - Sema::CodeSynthesisContext::ConstraintNormalization) { - return SemaRef.FindInstantiatedDecl(Loc, cast<NamedDecl>(D), - TemplateArgs); - } assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); diff --git a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp index 6c1a229a9fdda..5af4ec75cae90 100644 --- a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp +++ b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp @@ -722,6 +722,34 @@ template struct d; } // namespace GH115098 +namespace GH123441 { + +struct buf { + constexpr buf(auto&&... initList) requires (sizeof...(initList) <= 8); +}; + +constexpr buf::buf(auto&&... initList) requires (sizeof...(initList) <= 8) {} + +template <class T> +struct buffer { + constexpr buffer(auto&&... initList) requires (sizeof...(initList) <= 8); +}; + +template <class T> +constexpr buffer<T>::buffer(auto&&... initList) requires (sizeof...(initList) <= 8) {} + +template <class... T> +struct foo { // expected-note {{foo defined here}} + constexpr foo(auto&&... initList) + requires (sizeof...(initList) <= 8); +}; + +template <class... T> +constexpr foo<T...>::foo(auto&&...
initList) // expected-error {{does not match any declaration}} + requires (sizeof...(T) <= 8) {} + +} // namespace GH123441 + namespace GH114685 { template struct ptr { From 092372da15e5165be14cdbb7cac3cf4976fd82d0 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar <1663364+MaheshRavishankar@users.noreply.github.com> Date: Mon, 27 Jan 2025 07:05:34 -0800 Subject: [PATCH 203/432] [mlir][Tensor] Rework `ReifyRankedShapedTypeInterface` implementation for `tensor.expand_shape` op. (#113501) The op carries the output shape directly, so it can be used as-is. Also adds a method to get the shape as a `SmallVector<OpFoldResult>`. Signed-off-by: MaheshRavishankar --- .../mlir/Dialect/Tensor/IR/TensorOps.td | 3 + .../mlir/Dialect/Utils/StaticValueUtils.h | 3 + .../IR/TensorInferTypeOpInterfaceImpl.cpp | 115 +++--------------- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 4 + mlir/lib/Dialect/Utils/StaticValueUtils.cpp | 10 +- mlir/lib/Interfaces/InferTypeOpInterface.cpp | 8 -- .../resolve-shaped-type-result-dims.mlir | 7 +- mlir/test/Dialect/Tensor/fold-empty-op.mlir | 9 +- 8 files changed, 43 insertions(+), 116 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td index 8ad1b23cb2bfe..3ef7c74fd3af1 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td +++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td @@ -1165,6 +1165,9 @@ def Tensor_ExpandShapeOp : Tensor_ReassociativeReshapeOp<"expand_shape"> { let extraClassDeclaration = commonExtraClassDeclaration # [{ int64_t getCorrespondingSourceDim(int64_t resultDim); + // Return output shape as mixed static/dynamic shapes. + SmallVector<OpFoldResult> getMixedOutputShape(); + // Infer the output shape for a tensor.expand_shape when it is possible // to do so. static FailureOr<SmallVector<OpFoldResult>> inferOutputShape( diff --git a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h index d1f7ab1156248..2a3a2defb810d 100644 --- a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h +++ b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h @@ -144,6 +144,9 @@ bool isEqualConstantIntOrValueArray(ArrayRef<OpFoldResult> ofrs1, /// Return a vector of OpFoldResults with the same size a staticValues, but /// all elements for which ShapedType::isDynamic is true, will be replaced by /// dynamicValues. +SmallVector<OpFoldResult> getMixedValues(ArrayRef<int64_t> staticValues, + ValueRange dynamicValues, + MLIRContext *context); SmallVector<OpFoldResult> getMixedValues(ArrayRef<int64_t> staticValues, ValueRange dynamicValues, Builder &b); diff --git a/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp index 7ff435a033985..f6fea08e2e717 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp @@ -16,24 +16,6 @@ using namespace mlir; using namespace mlir::tensor; -/// Compute a map that for a given dimension of the expanded type gives the -/// dimension in the collapsed type it maps to. Essentially its the inverse of -/// the `reassocation` maps.
-static llvm::DenseMap<int64_t, int64_t> -getExpandedDimToCollapsedDimMap(ArrayRef<AffineMap> reassociation) { - llvm::DenseMap<int64_t, int64_t> expandedDimToCollapsedDim; - for (const auto &map : enumerate(reassociation)) { - unsigned startPos = - cast<AffineDimExpr>(map.value().getResults().front()).getPosition(); - unsigned endPos = - cast<AffineDimExpr>(map.value().getResults().back()).getPosition(); - for (auto dim : llvm::seq_inclusive(startPos, endPos)) { - expandedDimToCollapsedDim[dim] = map.index(); - } - } - return expandedDimToCollapsedDim; -} - /// For reshape op compute the shape at dimension `dimIndex` of the output in /// terms of shape of the `src`, when the reshape op is a collapsing /// operation. It is the product of the shape of the collapsed dimensions of the @@ -76,84 +58,15 @@ static SmallVector<OpFoldResult> getCollapsedOutputShapeFromInputShape( })); } -/// For an expanding reshape op, compute the value for a dimension of the output -/// from the shape of the input. -static OpFoldResult getExpandedOutputDimFromInputShape( - OpBuilder &builder, Location loc, int64_t dimIndex, Value src, - ArrayRef<int64_t> dstStaticShape, ArrayRef<AffineMap> reassociation, - llvm::DenseMap<int64_t, int64_t> &expandedDimToCollapsedDim) { - if (!ShapedType::isDynamic(dstStaticShape[dimIndex])) { - // Static dimension: return Attribute. - return builder.getIndexAttr(dstStaticShape[dimIndex]); - } - unsigned sourceDimPos = expandedDimToCollapsedDim[dimIndex]; - unsigned startPos = - cast<AffineDimExpr>(reassociation[sourceDimPos].getResults().front()) - .getPosition(); - unsigned endPos = - cast<AffineDimExpr>(reassociation[sourceDimPos].getResults().back()) - .getPosition(); - int64_t linearizedStaticDim = 1; - for (auto d : - llvm::enumerate(dstStaticShape.slice(startPos, endPos - startPos + 1))) { - if (d.index() + startPos == static_cast<unsigned>(dimIndex)) - continue; - assert(!ShapedType::isDynamic(d.value()) && - "single dimension cannot be expanded into multiple dynamic " - "dimensions"); - linearizedStaticDim *= d.value(); - } - OpFoldResult sourceDim = - builder.create<tensor::DimOp>(loc, src, sourceDimPos).getResult(); - - // Dynamic dimension: return Value. - return affine::makeComposedAffineApply( - builder, loc, - AffineMap::get( - 0, 1, - builder.getAffineSymbolExpr(0).floorDiv(linearizedStaticDim)), - sourceDim) - ->getResult(0); -} - -/// Given the `src` of an expanding reshape op, the reassociation maps and the -/// result type, compute the shape of the result of the reshape. -static SmallVector<OpFoldResult> getExpandedOutputShapeFromInputShape( - OpBuilder &builder, Location loc, Value src, - ArrayRef<int64_t> dstStaticShape, ArrayRef<AffineMap> reassociation) { - llvm::DenseMap<int64_t, int64_t> expandedDimToCollapsedDim = - getExpandedDimToCollapsedDimMap(reassociation); - return llvm::to_vector<4>(llvm::map_range( - llvm::seq<int64_t>(0, dstStaticShape.size()), [&](int64_t dim) { - return getExpandedOutputDimFromInputShape(builder, loc, dim, src, - dstStaticShape, reassociation, - expandedDimToCollapsedDim); - })); -} - -static SmallVector<OpFoldResult> -getReshapeOutputShapeFromInputShape(OpBuilder &builder, Location loc, Value src, - ArrayRef<int64_t> dstStaticShape, - ArrayRef<AffineMap> reassocation) { - return dstStaticShape.size() > - static_cast<size_t>( - llvm::cast<ShapedType>(src.getType()).getRank()) - ?
getExpandedOutputShapeFromInputShape( - builder, loc, src, dstStaticShape, reassocation) : getCollapsedOutputShapeFromInputShape( - builder, loc, src, dstStaticShape, reassocation); -} - -template <typename OpTy> -struct ReifyExpandOrCollapseShapeOp +struct ReifyCollapseShapeOp : public ReifyRankedShapedTypeOpInterface::ExternalModel< - ReifyExpandOrCollapseShapeOp<OpTy>, OpTy> { + ReifyCollapseShapeOp, CollapseShapeOp> { LogicalResult reifyResultShapes(Operation *op, OpBuilder &b, ReifiedRankedShapedTypeDims &reifiedReturnShapes) const { auto loc = op->getLoc(); - auto reshapeOp = cast<OpTy>(op); - reifiedReturnShapes.push_back(getReshapeOutputShapeFromInputShape( + auto reshapeOp = cast<CollapseShapeOp>(op); + reifiedReturnShapes.push_back(getCollapsedOutputShapeFromInputShape( b, loc, reshapeOp.getSrc(), reshapeOp.getResultType().getShape(), reshapeOp.getReassociationMaps())); return success(); @@ -162,6 +75,20 @@ struct ReifyExpandOrCollapseShapeOp namespace { +struct ReifyExpandShapeOp + : public ReifyRankedShapedTypeOpInterface::ExternalModel<ReifyExpandShapeOp, ExpandShapeOp> { + LogicalResult + reifyResultShapes(Operation *op, OpBuilder &b, + ReifiedRankedShapedTypeDims &reifyResultShapes) const { + auto expandShapeOp = cast<ExpandShapeOp>(op); + SmallVector<OpFoldResult> resultShapes = + expandShapeOp.getMixedOutputShape(); + reifyResultShapes.emplace_back(std::move(resultShapes)); + return success(); + } +}; + struct ReifyPadOp : public ReifyRankedShapedTypeOpInterface::ExternalModel<ReifyPadOp, PadOp> { LogicalResult @@ -202,10 +129,8 @@ struct ReifyPadOp void mlir::tensor::registerInferTypeOpInterfaceExternalModels( DialectRegistry &registry) { registry.addExtension(+[](MLIRContext *ctx, TensorDialect *dialect) { - ExpandShapeOp::attachInterface< - ReifyExpandOrCollapseShapeOp<ExpandShapeOp>>(*ctx); - CollapseShapeOp::attachInterface< - ReifyExpandOrCollapseShapeOp<CollapseShapeOp>>(*ctx); + ExpandShapeOp::attachInterface<ReifyExpandShapeOp>(*ctx); + CollapseShapeOp::attachInterface<ReifyCollapseShapeOp>(*ctx); PadOp::attachInterface<ReifyPadOp>(*ctx); }); } diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 24a1d55315319..117908129561f 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -1732,6 +1732,10 @@ ExpandShapeOp::inferOutputShape(OpBuilder &b, Location loc, return *outputShape; } +SmallVector<OpFoldResult> ExpandShapeOp::getMixedOutputShape() { + return getMixedValues(getStaticOutputShape(), getOutputShape(), getContext()); +} + void ExpandShapeOp::build(OpBuilder &builder, OperationState &result, Type resultType, Value src, ArrayRef<ReassociationIndices> reassociation, diff --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp index 5c8f6ded39ba4..fcb736aa031f3 100644 --- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp +++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp @@ -191,7 +191,8 @@ bool isEqualConstantIntOrValueArray(ArrayRef<OpFoldResult> ofrs1, /// elements for which ShapedType::isDynamic is true, will be replaced by /// dynamicValues. SmallVector<OpFoldResult> getMixedValues(ArrayRef<int64_t> staticValues, - ValueRange dynamicValues, Builder &b) { + ValueRange dynamicValues, + MLIRContext *context) { SmallVector<OpFoldResult> res; res.reserve(staticValues.size()); unsigned numDynamic = 0; @@ -200,10 +201,15 @@ SmallVector<OpFoldResult> getMixedValues(ArrayRef<int64_t> staticValues, int64_t value = staticValues[idx]; res.push_back(ShapedType::isDynamic(value) ?
OpFoldResult{dynamicValues[numDynamic++]} - : OpFoldResult{b.getI64IntegerAttr(staticValues[idx])}); + : OpFoldResult{IntegerAttr::get( + IntegerType::get(context, 64), staticValues[idx])}); } return res; } +SmallVector<OpFoldResult> getMixedValues(ArrayRef<int64_t> staticValues, + ValueRange dynamicValues, Builder &b) { + return getMixedValues(staticValues, dynamicValues, b.getContext()); +} /// Decompose a vector of mixed static or dynamic values into the corresponding /// pair of arrays. This is the inverse function of `getMixedValues`. diff --git a/mlir/lib/Interfaces/InferTypeOpInterface.cpp b/mlir/lib/Interfaces/InferTypeOpInterface.cpp index 3eb401c449980..6b5e103cd36c2 100644 --- a/mlir/lib/Interfaces/InferTypeOpInterface.cpp +++ b/mlir/lib/Interfaces/InferTypeOpInterface.cpp @@ -48,14 +48,6 @@ mlir::reifyResultShapes(OpBuilder &b, Operation *op, assert(shapedType.getRank() == static_cast<int64_t>(reifiedReturnShapes[resultIdx].size()) && "incorrect implementation of ReifyRankedShapedTypeOpInterface"); - for (int64_t dim = 0; dim < shapedType.getRank(); ++dim) { - // reifyResultShapes must return: - // * Attribute for static dimensions - // * Value for dynamic dimensions - assert(shapedType.isDynamicDim(dim) == - isa<Value>(reifiedReturnShapes[resultIdx][dim]) && - "incorrect implementation of ReifyRankedShapedTypeOpInterface"); - } ++resultIdx; } // Assert that every shaped value result was reified. diff --git a/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir b/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir index 8fb84248c9613..3bc1f56d816d7 100644 --- a/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir +++ b/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir @@ -210,15 +210,12 @@ func.func @dim_reshape_expansion(%arg0 : tensor<6x5x?xf32>, %sz0: index) -> (ind %3 = tensor.dim %0, %c4 : tensor<2x3x5x4x?x7xf32> return %1, %2, %3 : index, index, index } -// CHECK: #[[MAP:.+]] = affine_map<()[s0] -> (s0 floordiv 28)> // CHECK: func @dim_reshape_expansion // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<6x5x?xf32> -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-SAME: %[[ARG1:.+]]: index // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK: %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C2]] -// CHECK: %[[D1:.+]] = affine.apply #[[MAP]]()[%[[D0]]] -// CHECK: return %[[C3]], %[[C4]], %[[D1]] +// CHECK: return %[[C3]], %[[C4]], %[[ARG1]] // ----- diff --git a/mlir/test/Dialect/Tensor/fold-empty-op.mlir b/mlir/test/Dialect/Tensor/fold-empty-op.mlir index 65ceb4ff3e3df..850bbcee34020 100644 --- a/mlir/test/Dialect/Tensor/fold-empty-op.mlir +++ b/mlir/test/Dialect/Tensor/fold-empty-op.mlir @@ -10,7 +10,6 @@ module attributes {transform.with_named_sequence} { } } -// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 floordiv 28)> // CHECK: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 * 28)> func.func @empty_reshape_expansion(%arg0 : index, %sz0: index) -> tensor<2x3x5x4x?x7xf32> { @@ -19,11 +18,9 @@ func.func @empty_reshape_expansion(%arg0 : index, %sz0: index) -> tensor<2x3x5x4 return %1 : tensor<2x3x5x4x?x7xf32> } // CHECK-LABEL: func @empty_reshape_expansion -// CHECK-SAME: %[[ARG0:.+]]: index -// CHECK: %[[OLD_INIT:.+]] = tensor.empty(%{{.*}}) : tensor<6x5x?xf32> -// CHECK-NEXT: %[[DIM:.*]] = tensor.dim %[[OLD_INIT]] -// CHECK-NEXT: %[[D:.+]] = affine.apply #[[$MAP]]()[%[[DIM]]] -// CHECK-NEXT: %[[INIT:.+]] = tensor.empty(%[[D]]) +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]:
index +// CHECK-NEXT: %[[INIT:.+]] = tensor.empty(%[[ARG1]]) // CHECK-NEXT: return %[[INIT]] func.func @empty_reshape_collapse(%arg0 : index) -> tensor<6x5x?xf32> { From 62340ff8d844fc02cd1bd34ff6235f1f0e1e464f Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Mon, 27 Jan 2025 10:12:20 -0500 Subject: [PATCH 204/432] [AMDGPU][True16][MC] true16 for v_cmpx_xx_f16 (#123419) A bulk commit of true16 support for v_cmpx_xx_f16 instructions including: v_cmpx_f_f16 v_cmpx_le_f16 v_cmpx_gt_f16 v_cmpx_lg_f16 v_cmpx_ge_f16 v_cmpx_o_f16 v_cmpx_u_f16 v_cmpx_nge_f16 v_cmpx_nlg_f16 v_cmpx_ngt_f16 v_cmpx_nle_f16 v_cmpx_neq_f16 v_cmpx_nlt_f16 v_cmpx_t_f16 v_cmpx_eq_f16 is not in this patch and will be added in the following patch --- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 28 +- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s | 966 +++++++------ .../AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s | 366 +++-- .../MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s | 204 ++- llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s | 1113 +++++++++------ llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s | 963 +++++++------ llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s | 307 ++-- llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s | 582 +++++--- .../MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s | 630 ++++++--- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s | 168 ++- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s | 876 +++++++----- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s | 396 ++++-- llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s | 864 +++++++----- llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s | 744 +++++----- llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s | 216 ++- llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s | 504 +++++-- .../MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s | 504 +++++-- .../gfx11_dasm_vop3_dpp16_from_vopcx.txt | 756 +++++++--- .../gfx11_dasm_vop3_dpp8_from_vopcx.txt | 336 ++++- .../AMDGPU/gfx11_dasm_vop3_from_vopcx.txt | 196 ++- .../Disassembler/AMDGPU/gfx11_dasm_vopcx.txt | 910 +++++++++--- .../AMDGPU/gfx11_dasm_vopcx_dpp16.txt | 756 +++++++--- .../AMDGPU/gfx11_dasm_vopcx_dpp8.txt | 336 ++++- .../Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt | 179 ++- .../AMDGPU/gfx12_dasm_vop3cx_dpp16.txt | 1248 ++++++++++++++--- .../AMDGPU/gfx12_dasm_vop3cx_dpp8.txt | 341 ++++- .../Disassembler/AMDGPU/gfx12_dasm_vopcx.txt | 732 +++++++--- .../AMDGPU/gfx12_dasm_vopcx_dpp16.txt | 600 +++++--- .../AMDGPU/gfx12_dasm_vopcx_dpp8.txt | 168 ++- 29 files changed, 11102 insertions(+), 4887 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 46cad585b8a82..e16ac4423265e 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1975,22 +1975,22 @@ defm V_CMP_CLASS_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x07d, "v_cmp_cl defm V_CMP_CLASS_F32 : VOPC_Real_gfx11_gfx12<0x07e>; defm V_CMP_CLASS_F64 : VOPC_Real_gfx11_gfx12<0x07f>; -defm V_CMPX_F_F16_fake16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">; +defm V_CMPX_F_F16 : VOPCX_Real_t16_and_fake16_gfx11<0x080, "v_cmpx_f_f16">; defm V_CMPX_LT_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">; defm V_CMPX_EQ_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">; -defm V_CMPX_LE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x083, "v_cmpx_le_f16">; -defm V_CMPX_GT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">; -defm V_CMPX_LG_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">; -defm V_CMPX_GE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x086, "v_cmpx_ge_f16">; -defm V_CMPX_O_F16_fake16 : 
VOPCX_Real_t16_gfx11_gfx12<0x087, "v_cmpx_o_f16">; -defm V_CMPX_U_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x088, "v_cmpx_u_f16">; -defm V_CMPX_NGE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x089, "v_cmpx_nge_f16">; -defm V_CMPX_NLG_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08a, "v_cmpx_nlg_f16">; -defm V_CMPX_NGT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08b, "v_cmpx_ngt_f16">; -defm V_CMPX_NLE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08c, "v_cmpx_nle_f16">; -defm V_CMPX_NEQ_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08d, "v_cmpx_neq_f16">; -defm V_CMPX_NLT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08e, "v_cmpx_nlt_f16">; -defm V_CMPX_T_F16_fake16 : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16_fake16", "v_cmpx_t_f16", "v_cmpx_tru_f16">; +defm V_CMPX_LE_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x083, "v_cmpx_le_f16">; +defm V_CMPX_GT_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">; +defm V_CMPX_LG_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">; +defm V_CMPX_GE_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x086, "v_cmpx_ge_f16">; +defm V_CMPX_O_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x087, "v_cmpx_o_f16">; +defm V_CMPX_U_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x088, "v_cmpx_u_f16">; +defm V_CMPX_NGE_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x089, "v_cmpx_nge_f16">; +defm V_CMPX_NLG_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x08a, "v_cmpx_nlg_f16">; +defm V_CMPX_NGT_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x08b, "v_cmpx_ngt_f16">; +defm V_CMPX_NLE_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x08c, "v_cmpx_nle_f16">; +defm V_CMPX_NEQ_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x08d, "v_cmpx_neq_f16">; +defm V_CMPX_NLT_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x08e, "v_cmpx_nlt_f16">; +defm V_CMPX_T_F16 : VOPCX_Real_t16_and_fake16_gfx11<0x08f, "v_cmpx_t_f16", "V_CMPX_TRU_F16", "v_cmpx_tru_f16">; defm V_CMPX_F_F32 : VOPCX_Real_gfx11<0x090>; defm V_CMPX_LT_F32 : VOPCX_Real_gfx11_gfx12<0x091>; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s index e946097f35e23..6864ce20499cb 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s @@ -375,47 +375,56 @@ v_cmpx_eq_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_f_f16_e64_dpp 
v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_f_f16_e64_dpp |v1|, -v2 row_share:15 
row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_f_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_f_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_f_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_f_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_f_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_f_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_f_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_f_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_f_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_f_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_f_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_f_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_f_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -543,47 +552,56 @@ v_cmpx_f_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_f_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_f_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + 
+v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -813,47 +831,56 @@ v_cmpx_ge_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:15 
+// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1083,47 +1110,56 @@ v_cmpx_gt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:15 +// 
GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1353,47 +1389,56 @@ v_cmpx_le_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l 
row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: 
v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1902,47 +1947,56 @@ v_cmpx_ne_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + 
+v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1986,47 +2040,56 @@ v_cmpx_neq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_nge_f16_e64_dpp v1, 
v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: 
v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2070,47 +2133,56 @@ v_cmpx_nge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+
+v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+
+v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2154,47 +2226,56 @@ v_cmpx_ngt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2238,47 +2319,56 @@ v_cmpx_nle_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+
+v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+
+v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2322,47 +2412,56 @@ v_cmpx_nlg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+
+v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2406,47 +2505,56 @@ v_cmpx_nlt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+
+v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2490,47 +2598,47 @@ v_cmpx_o_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl
 v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_t_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_t_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2658,47 +2766,56 @@ v_cmpx_t_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_cmpx_t_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_t_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]

-v_cmpx_tru_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_tru_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_tru_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_tru_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_tru_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_tru_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_t_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_t_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+v_cmpx_t_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_t_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+
+v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_tru_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_t_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2742,47 +2859,56 @@ v_cmpx_tru_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_cmpx_tru_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_t_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s
index e60406078f745..0e36812c78dc1 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s
@@ -116,17 +116,26 @@ v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

-v_cmpx_f_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_f_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_f_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x80,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_f_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x80,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x80,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x80,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+
+v_cmpx_f_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+v_cmpx_f_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_f_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x80,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x80,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 v_cmpx_f_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmpx_f_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x90,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -158,17 +167,26 @@ v_cmpx_f_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cmpx_f_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cmpx_f_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc8,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

-v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

-v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -236,17 +254,26 @@ v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

-v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+
+v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -314,17 +341,26 @@ v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

-v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

-v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -392,17 +428,26 @@ v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

-v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

-v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding:
[0x7e,0x93,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -557,17 +602,26 @@ v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -581,17 +635,26 @@ v_cmpx_neq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -605,17 +668,26 @@ v_cmpx_nge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x99,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -629,17 +701,26 @@ v_cmpx_ngt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -653,17 +734,26 @@ v_cmpx_nle_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] 
dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -677,17 +767,26 @@ v_cmpx_nlg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -701,17 +800,26 @@ v_cmpx_nlt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nlt_f32_e64_dpp -|v255|, 
-|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -725,17 +833,17 @@ v_cmpx_o_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x97,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_t_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_t_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_t_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8f,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8f,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_t_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_t_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -767,17 +875,26 @@ v_cmpx_t_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_t_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_t_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcf,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_tru_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_tru_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_tru_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_tru_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_tru_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_tru_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_tru_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8f,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_tru_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8f,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_t_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_t_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 
+// GFX11: v_cmpx_t_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8f,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8f,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_tru_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_t_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -791,17 +908,26 @@ v_cmpx_tru_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_tru_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_t_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9f,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s index 
799a8f86a01e9..a4340919ca6d2 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s @@ -545,11 +545,11 @@ v_cmpx_eq_u64_e64 src_scc, exec v_cmpx_eq_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_eq_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_f_f16_e64 v1, v2 -// GFX11: v_cmpx_f_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_f_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_f_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_f_f16_e64 v255, v255 -// GFX11: v_cmpx_f_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_f_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_f_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] v_cmpx_f_f16_e64 s1, s2 // GFX11: v_cmpx_f_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x04,0x00,0x00] @@ -590,6 +590,12 @@ v_cmpx_f_f16_e64 -src_scc, |vcc_lo| v_cmpx_f_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_f_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x80,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_f_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_f_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x80,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_f_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_f_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x80,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_f_f32_e64 v1, v2 // GFX11: v_cmpx_f_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00] @@ -833,11 +839,11 @@ v_cmpx_f_u64_e64 src_scc, exec v_cmpx_f_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_f_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd8,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_ge_f16_e64 v1, v2 -// GFX11: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ge_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_ge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ge_f16_e64 v255, v255 -// GFX11: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ge_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_ge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_f16_e64 s1, s2 // GFX11: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00] @@ -878,6 +884,12 @@ v_cmpx_ge_f16_e64 -src_scc, |vcc_lo| v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_ge_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ge_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_ge_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ge_f32_e64 v1, v2 // GFX11: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00] @@ -1223,11 +1235,11 @@ v_cmpx_ge_u64_e64 src_scc, exec v_cmpx_ge_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_ge_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_gt_f16_e64 v1, v2 -// GFX11: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_gt_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_gt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_gt_f16_e64 v255, v255 -// GFX11: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] 
+v_cmpx_gt_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_gt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_f16_e64 s1, s2 // GFX11: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00] @@ -1268,6 +1280,12 @@ v_cmpx_gt_f16_e64 -src_scc, |vcc_lo| v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_gt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_gt_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_gt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_gt_f32_e64 v1, v2 // GFX11: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00] @@ -1613,11 +1631,11 @@ v_cmpx_gt_u64_e64 src_scc, exec v_cmpx_gt_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_gt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_le_f16_e64 v1, v2 -// GFX11: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_le_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_le_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_le_f16_e64 v255, v255 -// GFX11: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_le_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_le_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_f16_e64 s1, s2 // GFX11: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00] @@ -1658,6 +1676,12 @@ v_cmpx_le_f16_e64 -src_scc, |vcc_lo| v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_le_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_le_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_le_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_le_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_le_f32_e64 v1, v2 // GFX11: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00] @@ -2003,11 +2027,11 @@ v_cmpx_le_u64_e64 src_scc, exec v_cmpx_le_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_le_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_lg_f16_e64 v1, v2 -// GFX11: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_lg_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_lg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_lg_f16_e64 v255, v255 -// GFX11: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_lg_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_lg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lg_f16_e64 s1, s2 // GFX11: v_cmpx_lg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00] @@ -2048,6 +2072,12 @@ v_cmpx_lg_f16_e64 -src_scc, |vcc_lo| v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_lg_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_lg_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_lg_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_lg_f16_e64 v255.l, v255.h ; encoding: 
[0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_lg_f32_e64 v1, v2 // GFX11: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00] @@ -2789,11 +2819,11 @@ v_cmpx_ne_u64_e64 src_scc, exec v_cmpx_ne_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_ne_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_neq_f16_e64 v1, v2 -// GFX11: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_neq_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_neq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_neq_f16_e64 v255, v255 -// GFX11: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_neq_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_neq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] v_cmpx_neq_f16_e64 s1, s2 // GFX11: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00] @@ -2834,6 +2864,12 @@ v_cmpx_neq_f16_e64 -src_scc, |vcc_lo| v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_neq_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_neq_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_neq_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_neq_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_neq_f32_e64 v1, v2 // GFX11: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00] @@ -2915,11 +2951,11 @@ v_cmpx_neq_f64_e64 -|src_scc|, -|exec| v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nge_f16_e64 v1, v2 -// GFX11: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nge_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_nge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nge_f16_e64 v255, v255 -// GFX11: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nge_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_nge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nge_f16_e64 s1, s2 // GFX11: v_cmpx_nge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00] @@ -2960,6 +2996,12 @@ v_cmpx_nge_f16_e64 -src_scc, |vcc_lo| v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nge_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_nge_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nge_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_nge_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nge_f32_e64 v1, v2 // GFX11: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00] @@ -3041,11 +3083,11 @@ v_cmpx_nge_f64_e64 -|src_scc|, -|exec| v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_ngt_f16_e64 v1, v2 -// GFX11: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ngt_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_ngt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] 
-v_cmpx_ngt_f16_e64 v255, v255 -// GFX11: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ngt_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_ngt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ngt_f16_e64 s1, s2 // GFX11: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00] @@ -3086,6 +3128,12 @@ v_cmpx_ngt_f16_e64 -src_scc, |vcc_lo| v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_ngt_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_ngt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ngt_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_ngt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ngt_f32_e64 v1, v2 // GFX11: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00] @@ -3167,11 +3215,11 @@ v_cmpx_ngt_f64_e64 -|src_scc|, -|exec| v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nle_f16_e64 v1, v2 -// GFX11: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nle_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_nle_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nle_f16_e64 v255, v255 -// GFX11: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nle_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_nle_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nle_f16_e64 s1, s2 // GFX11: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00] @@ -3212,6 +3260,12 @@ v_cmpx_nle_f16_e64 -src_scc, |vcc_lo| v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nle_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_nle_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nle_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_nle_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nle_f32_e64 v1, v2 // GFX11: v_cmpx_nle_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00] @@ -3293,11 +3347,11 @@ v_cmpx_nle_f64_e64 -|src_scc|, -|exec| v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nlg_f16_e64 v1, v2 -// GFX11: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nlg_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_nlg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nlg_f16_e64 v255, v255 -// GFX11: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nlg_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_nlg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nlg_f16_e64 s1, s2 // GFX11: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00] @@ -3338,6 +3392,12 @@ v_cmpx_nlg_f16_e64 -src_scc, |vcc_lo| v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: 
[0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nlg_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_nlg_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nlg_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_nlg_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nlg_f32_e64 v1, v2 // GFX11: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00] @@ -3419,11 +3479,11 @@ v_cmpx_nlg_f64_e64 -|src_scc|, -|exec| v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nlt_f16_e64 v1, v2 -// GFX11: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nlt_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_nlt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nlt_f16_e64 v255, v255 -// GFX11: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nlt_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_nlt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nlt_f16_e64 s1, s2 // GFX11: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00] @@ -3464,6 +3524,12 @@ v_cmpx_nlt_f16_e64 -src_scc, |vcc_lo| v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nlt_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_nlt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nlt_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_nlt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nlt_f32_e64 v1, v2 // GFX11: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00] @@ -3545,11 +3611,11 @@ v_cmpx_nlt_f64_e64 -|src_scc|, -|exec| v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_o_f16_e64 v1, v2 -// GFX11: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_o_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_o_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_o_f16_e64 v255, v255 -// GFX11: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_o_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_o_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] v_cmpx_o_f16_e64 s1, s2 // GFX11: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00] @@ -3590,6 +3656,12 @@ v_cmpx_o_f16_e64 -src_scc, |vcc_lo| v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_o_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_o_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_o_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_o_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_o_f32_e64 v1, v2 // GFX11: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00] @@ -3671,11 +3743,11 @@ v_cmpx_o_f64_e64 -|src_scc|, -|exec| v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp ; encoding: 
[0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_t_f16_e64 v1, v2 -// GFX11: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_t_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_t_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_t_f16_e64 v255, v255 -// GFX11: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_t_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_t_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] v_cmpx_t_f16_e64 s1, s2 // GFX11: v_cmpx_t_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x04,0x00,0x00] @@ -3959,11 +4031,11 @@ v_cmpx_t_u64_e64 src_scc, exec v_cmpx_t_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_t_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdf,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_tru_f16_e64 v1, v2 -// GFX11: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_tru_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_t_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_tru_f16_e64 v255, v255 -// GFX11: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_tru_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_t_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] v_cmpx_tru_f16_e64 s1, s2 // GFX11: v_cmpx_t_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x04,0x00,0x00] @@ -4004,6 +4076,12 @@ v_cmpx_tru_f16_e64 -src_scc, |vcc_lo| v_cmpx_tru_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_t_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_t_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_t_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8f,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_t_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_t_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8f,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_tru_f32_e64 v1, v2 // GFX11: v_cmpx_t_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9f,0xd4,0x01,0x05,0x02,0x00] @@ -4085,11 +4163,11 @@ v_cmpx_tru_f64_e64 -|src_scc|, -|exec| v_cmpx_tru_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_t_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaf,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_u_f16_e64 v1, v2 -// GFX11: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_u_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_u_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_u_f16_e64 v255, v255 -// GFX11: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_u_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_u_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] v_cmpx_u_f16_e64 s1, s2 // GFX11: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00] @@ -4130,6 +4208,12 @@ v_cmpx_u_f16_e64 -src_scc, |vcc_lo| v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_u_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_u_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_u_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_u_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_u_f32_e64 v1, v2 // GFX11: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s index 88d9fb6cc1357..98aba2b960ad9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s @@ -566,50 +566,65 @@ v_cmpx_eq_u64 src_scc, v[2:3] v_cmpx_eq_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_eq_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_f_f16 v1, v2 -// GFX11: v_cmpx_f_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x00,0x7d] +v_cmpx_f_f16 v1.l, v2.l +// GFX11: v_cmpx_f_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x00,0x7d] -v_cmpx_f_f16 v127, v2 -// GFX11: v_cmpx_f_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x00,0x7d] +v_cmpx_f_f16 v127.l, v2.l +// GFX11: v_cmpx_f_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x00,0x7d] -v_cmpx_f_f16 s1, v2 -// GFX11: v_cmpx_f_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x00,0x7d] +v_cmpx_f_f16 s1, v2.l +// GFX11: v_cmpx_f_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x00,0x7d] -v_cmpx_f_f16 s105, v2 -// GFX11: v_cmpx_f_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x00,0x7d] +v_cmpx_f_f16 s105, v2.l +// GFX11: v_cmpx_f_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x00,0x7d] -v_cmpx_f_f16 vcc_lo, v2 -// GFX11: v_cmpx_f_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x00,0x7d] +v_cmpx_f_f16 vcc_lo, v2.l +// GFX11: v_cmpx_f_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x00,0x7d] -v_cmpx_f_f16 vcc_hi, v2 -// GFX11: v_cmpx_f_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x00,0x7d] +v_cmpx_f_f16 vcc_hi, v2.l +// GFX11: v_cmpx_f_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x00,0x7d] -v_cmpx_f_f16 ttmp15, v2 -// GFX11: v_cmpx_f_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x00,0x7d] +v_cmpx_f_f16 ttmp15, v2.l +// GFX11: v_cmpx_f_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x00,0x7d] -v_cmpx_f_f16 m0, v2 -// GFX11: v_cmpx_f_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x00,0x7d] +v_cmpx_f_f16 m0, v2.l +// GFX11: v_cmpx_f_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x00,0x7d] -v_cmpx_f_f16 exec_lo, v2 -// GFX11: v_cmpx_f_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x00,0x7d] +v_cmpx_f_f16 exec_lo, v2.l +// GFX11: v_cmpx_f_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x00,0x7d] -v_cmpx_f_f16 exec_hi, v2 -// GFX11: v_cmpx_f_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x00,0x7d] +v_cmpx_f_f16 exec_hi, v2.l +// GFX11: v_cmpx_f_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x00,0x7d] -v_cmpx_f_f16 null, v2 -// GFX11: v_cmpx_f_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x00,0x7d] +v_cmpx_f_f16 null, v2.l +// GFX11: v_cmpx_f_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x00,0x7d] -v_cmpx_f_f16 -1, v2 -// GFX11: v_cmpx_f_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x00,0x7d] +v_cmpx_f_f16 -1, v2.l +// GFX11: v_cmpx_f_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x00,0x7d] -v_cmpx_f_f16 0.5, v2 -// GFX11: v_cmpx_f_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x00,0x7d] +v_cmpx_f_f16 0.5, v2.l +// GFX11: v_cmpx_f_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x00,0x7d] -v_cmpx_f_f16 src_scc, v2 -// GFX11: v_cmpx_f_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x00,0x7d] +v_cmpx_f_f16 src_scc, v2.l +// GFX11: v_cmpx_f_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x00,0x7d] -v_cmpx_f_f16 0xfe0b, v127 -// GFX11: v_cmpx_f_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_f_f16 0xfe0b, v127.l +// GFX11: v_cmpx_f_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_f_f16 v1.h, v2.l +// GFX11: v_cmpx_f_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x00,0x7d] + +v_cmpx_f_f16 v127.h, v2.l +// GFX11: v_cmpx_f_f16_e32 v127.h, v2.l ; encoding: 
[0xff,0x05,0x00,0x7d] + +v_cmpx_f_f16 0.5, v127.l +// GFX11: v_cmpx_f_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x00,0x7d] + +v_cmpx_f_f16 src_scc, v2.h +// GFX11: v_cmpx_f_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x01,0x7d] + +v_cmpx_f_f16 0xfe0b, v127.h +// GFX11: v_cmpx_f_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x01,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_f_f32 v1, v2 // GFX11: v_cmpx_f_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x20,0x7d] @@ -854,50 +869,65 @@ v_cmpx_f_u64 src_scc, v[2:3] v_cmpx_f_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_f_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb1,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ge_f16 v1, v2 -// GFX11: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] +v_cmpx_ge_f16 v1.l, v2.l +// GFX11: v_cmpx_ge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0c,0x7d] + +v_cmpx_ge_f16 v127.l, v2.l +// GFX11: v_cmpx_ge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0c,0x7d] + +v_cmpx_ge_f16 s1, v2.l +// GFX11: v_cmpx_ge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0c,0x7d] + +v_cmpx_ge_f16 s105, v2.l +// GFX11: v_cmpx_ge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0c,0x7d] + +v_cmpx_ge_f16 vcc_lo, v2.l +// GFX11: v_cmpx_ge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0c,0x7d] + +v_cmpx_ge_f16 vcc_hi, v2.l +// GFX11: v_cmpx_ge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0c,0x7d] -v_cmpx_ge_f16 v127, v2 -// GFX11: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] +v_cmpx_ge_f16 ttmp15, v2.l +// GFX11: v_cmpx_ge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0c,0x7d] -v_cmpx_ge_f16 s1, v2 -// GFX11: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] +v_cmpx_ge_f16 m0, v2.l +// GFX11: v_cmpx_ge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0c,0x7d] -v_cmpx_ge_f16 s105, v2 -// GFX11: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] +v_cmpx_ge_f16 exec_lo, v2.l +// GFX11: v_cmpx_ge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0c,0x7d] -v_cmpx_ge_f16 vcc_lo, v2 -// GFX11: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] +v_cmpx_ge_f16 exec_hi, v2.l +// GFX11: v_cmpx_ge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0c,0x7d] -v_cmpx_ge_f16 vcc_hi, v2 -// GFX11: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] +v_cmpx_ge_f16 null, v2.l +// GFX11: v_cmpx_ge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0c,0x7d] -v_cmpx_ge_f16 ttmp15, v2 -// GFX11: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] +v_cmpx_ge_f16 -1, v2.l +// GFX11: v_cmpx_ge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0c,0x7d] -v_cmpx_ge_f16 m0, v2 -// GFX11: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] +v_cmpx_ge_f16 0.5, v2.l +// GFX11: v_cmpx_ge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0c,0x7d] -v_cmpx_ge_f16 exec_lo, v2 -// GFX11: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] +v_cmpx_ge_f16 src_scc, v2.l +// GFX11: v_cmpx_ge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0c,0x7d] -v_cmpx_ge_f16 exec_hi, v2 -// GFX11: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] +v_cmpx_ge_f16 0xfe0b, v127.l +// GFX11: v_cmpx_ge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_ge_f16 null, v2 -// GFX11: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] +v_cmpx_ge_f16 v1.h, v2.l +// GFX11: v_cmpx_ge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0c,0x7d] -v_cmpx_ge_f16 -1, v2 -// GFX11: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] +v_cmpx_ge_f16 v127.h, v2.l +// GFX11: v_cmpx_ge_f16_e32 v127.h, v2.l ; encoding: 
[0xff,0x05,0x0c,0x7d] -v_cmpx_ge_f16 0.5, v2 -// GFX11: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] +v_cmpx_ge_f16 0.5, v127.l +// GFX11: v_cmpx_ge_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x0c,0x7d] -v_cmpx_ge_f16 src_scc, v2 -// GFX11: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] +v_cmpx_ge_f16 src_scc, v2.h +// GFX11: v_cmpx_ge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0d,0x7d] -v_cmpx_ge_f16 0xfe0b, v127 -// GFX11: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_f16 0xfe0b, v127.h +// GFX11: v_cmpx_ge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ge_f32 v1, v2 // GFX11: v_cmpx_ge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2c,0x7d] @@ -1262,50 +1292,65 @@ v_cmpx_ge_u64 src_scc, v[2:3] v_cmpx_ge_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_ge_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_gt_f16 v1, v2 -// GFX11: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] +v_cmpx_gt_f16 v1.l, v2.l +// GFX11: v_cmpx_gt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x08,0x7d] -v_cmpx_gt_f16 v127, v2 -// GFX11: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] +v_cmpx_gt_f16 v127.l, v2.l +// GFX11: v_cmpx_gt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x08,0x7d] -v_cmpx_gt_f16 s1, v2 -// GFX11: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] +v_cmpx_gt_f16 s1, v2.l +// GFX11: v_cmpx_gt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x08,0x7d] -v_cmpx_gt_f16 s105, v2 -// GFX11: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] +v_cmpx_gt_f16 s105, v2.l +// GFX11: v_cmpx_gt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x08,0x7d] -v_cmpx_gt_f16 vcc_lo, v2 -// GFX11: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] +v_cmpx_gt_f16 vcc_lo, v2.l +// GFX11: v_cmpx_gt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x08,0x7d] -v_cmpx_gt_f16 vcc_hi, v2 -// GFX11: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] +v_cmpx_gt_f16 vcc_hi, v2.l +// GFX11: v_cmpx_gt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x08,0x7d] -v_cmpx_gt_f16 ttmp15, v2 -// GFX11: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] +v_cmpx_gt_f16 ttmp15, v2.l +// GFX11: v_cmpx_gt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x08,0x7d] -v_cmpx_gt_f16 m0, v2 -// GFX11: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] +v_cmpx_gt_f16 m0, v2.l +// GFX11: v_cmpx_gt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x08,0x7d] -v_cmpx_gt_f16 exec_lo, v2 -// GFX11: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] +v_cmpx_gt_f16 exec_lo, v2.l +// GFX11: v_cmpx_gt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x08,0x7d] -v_cmpx_gt_f16 exec_hi, v2 -// GFX11: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] +v_cmpx_gt_f16 exec_hi, v2.l +// GFX11: v_cmpx_gt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x08,0x7d] -v_cmpx_gt_f16 null, v2 -// GFX11: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] +v_cmpx_gt_f16 null, v2.l +// GFX11: v_cmpx_gt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x08,0x7d] -v_cmpx_gt_f16 -1, v2 -// GFX11: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] +v_cmpx_gt_f16 -1, v2.l +// GFX11: v_cmpx_gt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x08,0x7d] -v_cmpx_gt_f16 0.5, v2 -// GFX11: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] +v_cmpx_gt_f16 0.5, v2.l +// GFX11: v_cmpx_gt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x08,0x7d] -v_cmpx_gt_f16 
src_scc, v2 -// GFX11: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] +v_cmpx_gt_f16 src_scc, v2.l +// GFX11: v_cmpx_gt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x08,0x7d] -v_cmpx_gt_f16 0xfe0b, v127 -// GFX11: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_f16 0xfe0b, v127.l +// GFX11: v_cmpx_gt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_gt_f16 v1.h, v2.l +// GFX11: v_cmpx_gt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x08,0x7d] + +v_cmpx_gt_f16 v127.h, v2.l +// GFX11: v_cmpx_gt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x08,0x7d] + +v_cmpx_gt_f16 0.5, v127.l +// GFX11: v_cmpx_gt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x08,0x7d] + +v_cmpx_gt_f16 src_scc, v2.h +// GFX11: v_cmpx_gt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x09,0x7d] + +v_cmpx_gt_f16 0xfe0b, v127.h +// GFX11: v_cmpx_gt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_gt_f32 v1, v2 // GFX11: v_cmpx_gt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x28,0x7d] @@ -1670,50 +1715,65 @@ v_cmpx_gt_u64 src_scc, v[2:3] v_cmpx_gt_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_gt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_le_f16 v1, v2 -// GFX11: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] +v_cmpx_le_f16 v1.l, v2.l +// GFX11: v_cmpx_le_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x06,0x7d] + +v_cmpx_le_f16 v127.l, v2.l +// GFX11: v_cmpx_le_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x06,0x7d] + +v_cmpx_le_f16 s1, v2.l +// GFX11: v_cmpx_le_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x06,0x7d] + +v_cmpx_le_f16 s105, v2.l +// GFX11: v_cmpx_le_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x06,0x7d] + +v_cmpx_le_f16 vcc_lo, v2.l +// GFX11: v_cmpx_le_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x06,0x7d] + +v_cmpx_le_f16 vcc_hi, v2.l +// GFX11: v_cmpx_le_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x06,0x7d] -v_cmpx_le_f16 v127, v2 -// GFX11: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] +v_cmpx_le_f16 ttmp15, v2.l +// GFX11: v_cmpx_le_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x06,0x7d] -v_cmpx_le_f16 s1, v2 -// GFX11: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] +v_cmpx_le_f16 m0, v2.l +// GFX11: v_cmpx_le_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x06,0x7d] -v_cmpx_le_f16 s105, v2 -// GFX11: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] +v_cmpx_le_f16 exec_lo, v2.l +// GFX11: v_cmpx_le_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x06,0x7d] -v_cmpx_le_f16 vcc_lo, v2 -// GFX11: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] +v_cmpx_le_f16 exec_hi, v2.l +// GFX11: v_cmpx_le_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x06,0x7d] -v_cmpx_le_f16 vcc_hi, v2 -// GFX11: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] +v_cmpx_le_f16 null, v2.l +// GFX11: v_cmpx_le_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x06,0x7d] -v_cmpx_le_f16 ttmp15, v2 -// GFX11: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] +v_cmpx_le_f16 -1, v2.l +// GFX11: v_cmpx_le_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x06,0x7d] -v_cmpx_le_f16 m0, v2 -// GFX11: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] +v_cmpx_le_f16 0.5, v2.l +// GFX11: v_cmpx_le_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x06,0x7d] -v_cmpx_le_f16 exec_lo, v2 -// GFX11: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] +v_cmpx_le_f16 src_scc, v2.l +// GFX11: v_cmpx_le_f16_e32 src_scc, v2.l ; 
encoding: [0xfd,0x04,0x06,0x7d] -v_cmpx_le_f16 exec_hi, v2 -// GFX11: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] +v_cmpx_le_f16 0xfe0b, v127.l +// GFX11: v_cmpx_le_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_le_f16 null, v2 -// GFX11: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] +v_cmpx_le_f16 v1.h, v2.l +// GFX11: v_cmpx_le_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x06,0x7d] -v_cmpx_le_f16 -1, v2 -// GFX11: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] +v_cmpx_le_f16 v127.h, v2.l +// GFX11: v_cmpx_le_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x06,0x7d] -v_cmpx_le_f16 0.5, v2 -// GFX11: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] +v_cmpx_le_f16 0.5, v127.l +// GFX11: v_cmpx_le_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x06,0x7d] -v_cmpx_le_f16 src_scc, v2 -// GFX11: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] +v_cmpx_le_f16 src_scc, v2.h +// GFX11: v_cmpx_le_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x07,0x7d] -v_cmpx_le_f16 0xfe0b, v127 -// GFX11: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_le_f16 0xfe0b, v127.h +// GFX11: v_cmpx_le_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_le_f32 v1, v2 // GFX11: v_cmpx_le_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x26,0x7d] @@ -2078,50 +2138,65 @@ v_cmpx_le_u64 src_scc, v[2:3] v_cmpx_le_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_le_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_lg_f16 v1, v2 -// GFX11: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] +v_cmpx_lg_f16 v1.l, v2.l +// GFX11: v_cmpx_lg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x7d] -v_cmpx_lg_f16 v127, v2 -// GFX11: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] +v_cmpx_lg_f16 v127.l, v2.l +// GFX11: v_cmpx_lg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x7d] -v_cmpx_lg_f16 s1, v2 -// GFX11: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] +v_cmpx_lg_f16 s1, v2.l +// GFX11: v_cmpx_lg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0a,0x7d] -v_cmpx_lg_f16 s105, v2 -// GFX11: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] +v_cmpx_lg_f16 s105, v2.l +// GFX11: v_cmpx_lg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0a,0x7d] -v_cmpx_lg_f16 vcc_lo, v2 -// GFX11: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] +v_cmpx_lg_f16 vcc_lo, v2.l +// GFX11: v_cmpx_lg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x7d] -v_cmpx_lg_f16 vcc_hi, v2 -// GFX11: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] +v_cmpx_lg_f16 vcc_hi, v2.l +// GFX11: v_cmpx_lg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x7d] -v_cmpx_lg_f16 ttmp15, v2 -// GFX11: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] +v_cmpx_lg_f16 ttmp15, v2.l +// GFX11: v_cmpx_lg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x7d] -v_cmpx_lg_f16 m0, v2 -// GFX11: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] +v_cmpx_lg_f16 m0, v2.l +// GFX11: v_cmpx_lg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x7d] -v_cmpx_lg_f16 exec_lo, v2 -// GFX11: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] +v_cmpx_lg_f16 exec_lo, v2.l +// GFX11: v_cmpx_lg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x7d] -v_cmpx_lg_f16 exec_hi, v2 -// GFX11: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] +v_cmpx_lg_f16 exec_hi, v2.l +// GFX11: v_cmpx_lg_f16_e32 exec_hi, v2.l 
; encoding: [0x7f,0x04,0x0a,0x7d] -v_cmpx_lg_f16 null, v2 -// GFX11: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] +v_cmpx_lg_f16 null, v2.l +// GFX11: v_cmpx_lg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0a,0x7d] -v_cmpx_lg_f16 -1, v2 -// GFX11: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] +v_cmpx_lg_f16 -1, v2.l +// GFX11: v_cmpx_lg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x7d] -v_cmpx_lg_f16 0.5, v2 -// GFX11: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] +v_cmpx_lg_f16 0.5, v2.l +// GFX11: v_cmpx_lg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x7d] -v_cmpx_lg_f16 src_scc, v2 -// GFX11: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] +v_cmpx_lg_f16 src_scc, v2.l +// GFX11: v_cmpx_lg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x7d] -v_cmpx_lg_f16 0xfe0b, v127 -// GFX11: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_lg_f16 0xfe0b, v127.l +// GFX11: v_cmpx_lg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_lg_f16 v1.h, v2.l +// GFX11: v_cmpx_lg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x7d] + +v_cmpx_lg_f16 v127.h, v2.l +// GFX11: v_cmpx_lg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x7d] + +v_cmpx_lg_f16 0.5, v127.l +// GFX11: v_cmpx_lg_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x0a,0x7d] + +v_cmpx_lg_f16 src_scc, v2.h +// GFX11: v_cmpx_lg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x7d] + +v_cmpx_lg_f16 0xfe0b, v127.h +// GFX11: v_cmpx_lg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_lg_f32 v1, v2 // GFX11: v_cmpx_lg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2a,0x7d] @@ -2909,50 +2984,65 @@ v_cmpx_ne_u64 src_scc, v[2:3] v_cmpx_ne_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_ne_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_neq_f16 v1, v2 -// GFX11: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d] +v_cmpx_neq_f16 v1.l, v2.l +// GFX11: v_cmpx_neq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1a,0x7d] + +v_cmpx_neq_f16 v127.l, v2.l +// GFX11: v_cmpx_neq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1a,0x7d] + +v_cmpx_neq_f16 s1, v2.l +// GFX11: v_cmpx_neq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1a,0x7d] + +v_cmpx_neq_f16 s105, v2.l +// GFX11: v_cmpx_neq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1a,0x7d] + +v_cmpx_neq_f16 vcc_lo, v2.l +// GFX11: v_cmpx_neq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1a,0x7d] + +v_cmpx_neq_f16 vcc_hi, v2.l +// GFX11: v_cmpx_neq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1a,0x7d] -v_cmpx_neq_f16 v127, v2 -// GFX11: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d] +v_cmpx_neq_f16 ttmp15, v2.l +// GFX11: v_cmpx_neq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1a,0x7d] -v_cmpx_neq_f16 s1, v2 -// GFX11: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d] +v_cmpx_neq_f16 m0, v2.l +// GFX11: v_cmpx_neq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1a,0x7d] -v_cmpx_neq_f16 s105, v2 -// GFX11: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d] +v_cmpx_neq_f16 exec_lo, v2.l +// GFX11: v_cmpx_neq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1a,0x7d] -v_cmpx_neq_f16 vcc_lo, v2 -// GFX11: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d] +v_cmpx_neq_f16 exec_hi, v2.l +// GFX11: v_cmpx_neq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1a,0x7d] -v_cmpx_neq_f16 vcc_hi, v2 -// GFX11: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d] 
+v_cmpx_neq_f16 null, v2.l +// GFX11: v_cmpx_neq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1a,0x7d] -v_cmpx_neq_f16 ttmp15, v2 -// GFX11: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d] +v_cmpx_neq_f16 -1, v2.l +// GFX11: v_cmpx_neq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1a,0x7d] -v_cmpx_neq_f16 m0, v2 -// GFX11: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d] +v_cmpx_neq_f16 0.5, v2.l +// GFX11: v_cmpx_neq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1a,0x7d] -v_cmpx_neq_f16 exec_lo, v2 -// GFX11: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d] +v_cmpx_neq_f16 src_scc, v2.l +// GFX11: v_cmpx_neq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1a,0x7d] -v_cmpx_neq_f16 exec_hi, v2 -// GFX11: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d] +v_cmpx_neq_f16 0xfe0b, v127.l +// GFX11: v_cmpx_neq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_neq_f16 null, v2 -// GFX11: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d] +v_cmpx_neq_f16 v1.h, v2.l +// GFX11: v_cmpx_neq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1a,0x7d] -v_cmpx_neq_f16 -1, v2 -// GFX11: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d] +v_cmpx_neq_f16 v127.h, v2.l +// GFX11: v_cmpx_neq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1a,0x7d] -v_cmpx_neq_f16 0.5, v2 -// GFX11: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d] +v_cmpx_neq_f16 0.5, v127.l +// GFX11: v_cmpx_neq_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1a,0x7d] -v_cmpx_neq_f16 src_scc, v2 -// GFX11: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d] +v_cmpx_neq_f16 src_scc, v2.h +// GFX11: v_cmpx_neq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1b,0x7d] -v_cmpx_neq_f16 0xfe0b, v127 -// GFX11: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_neq_f16 0xfe0b, v127.h +// GFX11: v_cmpx_neq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_neq_f32 v1, v2 // GFX11: v_cmpx_neq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3a,0x7d] @@ -3035,50 +3125,65 @@ v_cmpx_neq_f64 src_scc, v[2:3] v_cmpx_neq_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_neq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nge_f16 v1, v2 -// GFX11: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d] +v_cmpx_nge_f16 v1.l, v2.l +// GFX11: v_cmpx_nge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x12,0x7d] -v_cmpx_nge_f16 v127, v2 -// GFX11: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d] +v_cmpx_nge_f16 v127.l, v2.l +// GFX11: v_cmpx_nge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x12,0x7d] -v_cmpx_nge_f16 s1, v2 -// GFX11: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d] +v_cmpx_nge_f16 s1, v2.l +// GFX11: v_cmpx_nge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x12,0x7d] -v_cmpx_nge_f16 s105, v2 -// GFX11: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d] +v_cmpx_nge_f16 s105, v2.l +// GFX11: v_cmpx_nge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x12,0x7d] -v_cmpx_nge_f16 vcc_lo, v2 -// GFX11: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d] +v_cmpx_nge_f16 vcc_lo, v2.l +// GFX11: v_cmpx_nge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x12,0x7d] -v_cmpx_nge_f16 vcc_hi, v2 -// GFX11: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d] +v_cmpx_nge_f16 vcc_hi, v2.l +// GFX11: v_cmpx_nge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x12,0x7d] -v_cmpx_nge_f16 ttmp15, v2 -// GFX11: 
v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d] +v_cmpx_nge_f16 ttmp15, v2.l +// GFX11: v_cmpx_nge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x12,0x7d] -v_cmpx_nge_f16 m0, v2 -// GFX11: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d] +v_cmpx_nge_f16 m0, v2.l +// GFX11: v_cmpx_nge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x12,0x7d] -v_cmpx_nge_f16 exec_lo, v2 -// GFX11: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d] +v_cmpx_nge_f16 exec_lo, v2.l +// GFX11: v_cmpx_nge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x12,0x7d] -v_cmpx_nge_f16 exec_hi, v2 -// GFX11: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d] +v_cmpx_nge_f16 exec_hi, v2.l +// GFX11: v_cmpx_nge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x12,0x7d] -v_cmpx_nge_f16 null, v2 -// GFX11: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d] +v_cmpx_nge_f16 null, v2.l +// GFX11: v_cmpx_nge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x12,0x7d] -v_cmpx_nge_f16 -1, v2 -// GFX11: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d] +v_cmpx_nge_f16 -1, v2.l +// GFX11: v_cmpx_nge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x12,0x7d] -v_cmpx_nge_f16 0.5, v2 -// GFX11: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d] +v_cmpx_nge_f16 0.5, v2.l +// GFX11: v_cmpx_nge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x12,0x7d] -v_cmpx_nge_f16 src_scc, v2 -// GFX11: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d] +v_cmpx_nge_f16 src_scc, v2.l +// GFX11: v_cmpx_nge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x12,0x7d] -v_cmpx_nge_f16 0xfe0b, v127 -// GFX11: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nge_f16 0xfe0b, v127.l +// GFX11: v_cmpx_nge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_nge_f16 v1.h, v2.l +// GFX11: v_cmpx_nge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x12,0x7d] + +v_cmpx_nge_f16 v127.h, v2.l +// GFX11: v_cmpx_nge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x12,0x7d] + +v_cmpx_nge_f16 0.5, v127.l +// GFX11: v_cmpx_nge_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x12,0x7d] + +v_cmpx_nge_f16 src_scc, v2.h +// GFX11: v_cmpx_nge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x13,0x7d] + +v_cmpx_nge_f16 0xfe0b, v127.h +// GFX11: v_cmpx_nge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nge_f32 v1, v2 // GFX11: v_cmpx_nge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x32,0x7d] @@ -3161,50 +3266,65 @@ v_cmpx_nge_f64 src_scc, v[2:3] v_cmpx_nge_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_nge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ngt_f16 v1, v2 -// GFX11: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d] +v_cmpx_ngt_f16 v1.l, v2.l +// GFX11: v_cmpx_ngt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x16,0x7d] + +v_cmpx_ngt_f16 v127.l, v2.l +// GFX11: v_cmpx_ngt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x16,0x7d] + +v_cmpx_ngt_f16 s1, v2.l +// GFX11: v_cmpx_ngt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x16,0x7d] + +v_cmpx_ngt_f16 s105, v2.l +// GFX11: v_cmpx_ngt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x16,0x7d] + +v_cmpx_ngt_f16 vcc_lo, v2.l +// GFX11: v_cmpx_ngt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x16,0x7d] -v_cmpx_ngt_f16 v127, v2 -// GFX11: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d] +v_cmpx_ngt_f16 vcc_hi, v2.l +// GFX11: v_cmpx_ngt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x16,0x7d] -v_cmpx_ngt_f16 s1, v2 -// GFX11: 
v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d] +v_cmpx_ngt_f16 ttmp15, v2.l +// GFX11: v_cmpx_ngt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x16,0x7d] -v_cmpx_ngt_f16 s105, v2 -// GFX11: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d] +v_cmpx_ngt_f16 m0, v2.l +// GFX11: v_cmpx_ngt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x16,0x7d] -v_cmpx_ngt_f16 vcc_lo, v2 -// GFX11: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d] +v_cmpx_ngt_f16 exec_lo, v2.l +// GFX11: v_cmpx_ngt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x16,0x7d] -v_cmpx_ngt_f16 vcc_hi, v2 -// GFX11: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d] +v_cmpx_ngt_f16 exec_hi, v2.l +// GFX11: v_cmpx_ngt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x16,0x7d] -v_cmpx_ngt_f16 ttmp15, v2 -// GFX11: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d] +v_cmpx_ngt_f16 null, v2.l +// GFX11: v_cmpx_ngt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x16,0x7d] -v_cmpx_ngt_f16 m0, v2 -// GFX11: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d] +v_cmpx_ngt_f16 -1, v2.l +// GFX11: v_cmpx_ngt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x16,0x7d] -v_cmpx_ngt_f16 exec_lo, v2 -// GFX11: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d] +v_cmpx_ngt_f16 0.5, v2.l +// GFX11: v_cmpx_ngt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x16,0x7d] -v_cmpx_ngt_f16 exec_hi, v2 -// GFX11: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d] +v_cmpx_ngt_f16 src_scc, v2.l +// GFX11: v_cmpx_ngt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x16,0x7d] -v_cmpx_ngt_f16 null, v2 -// GFX11: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d] +v_cmpx_ngt_f16 0xfe0b, v127.l +// GFX11: v_cmpx_ngt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_ngt_f16 -1, v2 -// GFX11: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d] +v_cmpx_ngt_f16 v1.h, v2.l +// GFX11: v_cmpx_ngt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x16,0x7d] -v_cmpx_ngt_f16 0.5, v2 -// GFX11: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d] +v_cmpx_ngt_f16 v127.h, v2.l +// GFX11: v_cmpx_ngt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x16,0x7d] -v_cmpx_ngt_f16 src_scc, v2 -// GFX11: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d] +v_cmpx_ngt_f16 0.5, v127.l +// GFX11: v_cmpx_ngt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x16,0x7d] -v_cmpx_ngt_f16 0xfe0b, v127 -// GFX11: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ngt_f16 src_scc, v2.h +// GFX11: v_cmpx_ngt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x17,0x7d] + +v_cmpx_ngt_f16 0xfe0b, v127.h +// GFX11: v_cmpx_ngt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ngt_f32 v1, v2 // GFX11: v_cmpx_ngt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x36,0x7d] @@ -3287,50 +3407,65 @@ v_cmpx_ngt_f64 src_scc, v[2:3] v_cmpx_ngt_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_ngt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nle_f16 v1, v2 -// GFX11: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d] +v_cmpx_nle_f16 v1.l, v2.l +// GFX11: v_cmpx_nle_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x18,0x7d] + +v_cmpx_nle_f16 v127.l, v2.l +// GFX11: v_cmpx_nle_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x18,0x7d] + +v_cmpx_nle_f16 s1, v2.l +// GFX11: v_cmpx_nle_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x18,0x7d] + +v_cmpx_nle_f16 s105, v2.l +// GFX11: v_cmpx_nle_f16_e32 s105, 
v2.l ; encoding: [0x69,0x04,0x18,0x7d] + +v_cmpx_nle_f16 vcc_lo, v2.l +// GFX11: v_cmpx_nle_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x18,0x7d] -v_cmpx_nle_f16 v127, v2 -// GFX11: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d] +v_cmpx_nle_f16 vcc_hi, v2.l +// GFX11: v_cmpx_nle_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x18,0x7d] -v_cmpx_nle_f16 s1, v2 -// GFX11: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d] +v_cmpx_nle_f16 ttmp15, v2.l +// GFX11: v_cmpx_nle_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x18,0x7d] -v_cmpx_nle_f16 s105, v2 -// GFX11: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d] +v_cmpx_nle_f16 m0, v2.l +// GFX11: v_cmpx_nle_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x18,0x7d] -v_cmpx_nle_f16 vcc_lo, v2 -// GFX11: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d] +v_cmpx_nle_f16 exec_lo, v2.l +// GFX11: v_cmpx_nle_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x18,0x7d] -v_cmpx_nle_f16 vcc_hi, v2 -// GFX11: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d] +v_cmpx_nle_f16 exec_hi, v2.l +// GFX11: v_cmpx_nle_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x18,0x7d] -v_cmpx_nle_f16 ttmp15, v2 -// GFX11: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d] +v_cmpx_nle_f16 null, v2.l +// GFX11: v_cmpx_nle_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x18,0x7d] -v_cmpx_nle_f16 m0, v2 -// GFX11: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d] +v_cmpx_nle_f16 -1, v2.l +// GFX11: v_cmpx_nle_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x18,0x7d] -v_cmpx_nle_f16 exec_lo, v2 -// GFX11: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d] +v_cmpx_nle_f16 0.5, v2.l +// GFX11: v_cmpx_nle_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x18,0x7d] -v_cmpx_nle_f16 exec_hi, v2 -// GFX11: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d] +v_cmpx_nle_f16 src_scc, v2.l +// GFX11: v_cmpx_nle_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x18,0x7d] -v_cmpx_nle_f16 null, v2 -// GFX11: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d] +v_cmpx_nle_f16 0xfe0b, v127.l +// GFX11: v_cmpx_nle_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_nle_f16 -1, v2 -// GFX11: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d] +v_cmpx_nle_f16 v1.h, v2.l +// GFX11: v_cmpx_nle_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x18,0x7d] -v_cmpx_nle_f16 0.5, v2 -// GFX11: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d] +v_cmpx_nle_f16 v127.h, v2.l +// GFX11: v_cmpx_nle_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x18,0x7d] -v_cmpx_nle_f16 src_scc, v2 -// GFX11: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d] +v_cmpx_nle_f16 0.5, v127.l +// GFX11: v_cmpx_nle_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x18,0x7d] -v_cmpx_nle_f16 0xfe0b, v127 -// GFX11: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nle_f16 src_scc, v2.h +// GFX11: v_cmpx_nle_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x19,0x7d] + +v_cmpx_nle_f16 0xfe0b, v127.h +// GFX11: v_cmpx_nle_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nle_f32 v1, v2 // GFX11: v_cmpx_nle_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x38,0x7d] @@ -3413,50 +3548,65 @@ v_cmpx_nle_f64 src_scc, v[2:3] v_cmpx_nle_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_nle_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nlg_f16 v1, v2 -// GFX11: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: 
[0x01,0x05,0x14,0x7d] +v_cmpx_nlg_f16 v1.l, v2.l +// GFX11: v_cmpx_nlg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x14,0x7d] + +v_cmpx_nlg_f16 v127.l, v2.l +// GFX11: v_cmpx_nlg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x14,0x7d] + +v_cmpx_nlg_f16 s1, v2.l +// GFX11: v_cmpx_nlg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x14,0x7d] + +v_cmpx_nlg_f16 s105, v2.l +// GFX11: v_cmpx_nlg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x14,0x7d] + +v_cmpx_nlg_f16 vcc_lo, v2.l +// GFX11: v_cmpx_nlg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x14,0x7d] -v_cmpx_nlg_f16 v127, v2 -// GFX11: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d] +v_cmpx_nlg_f16 vcc_hi, v2.l +// GFX11: v_cmpx_nlg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x14,0x7d] -v_cmpx_nlg_f16 s1, v2 -// GFX11: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d] +v_cmpx_nlg_f16 ttmp15, v2.l +// GFX11: v_cmpx_nlg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x14,0x7d] -v_cmpx_nlg_f16 s105, v2 -// GFX11: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d] +v_cmpx_nlg_f16 m0, v2.l +// GFX11: v_cmpx_nlg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x14,0x7d] -v_cmpx_nlg_f16 vcc_lo, v2 -// GFX11: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d] +v_cmpx_nlg_f16 exec_lo, v2.l +// GFX11: v_cmpx_nlg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x14,0x7d] -v_cmpx_nlg_f16 vcc_hi, v2 -// GFX11: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d] +v_cmpx_nlg_f16 exec_hi, v2.l +// GFX11: v_cmpx_nlg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x14,0x7d] -v_cmpx_nlg_f16 ttmp15, v2 -// GFX11: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d] +v_cmpx_nlg_f16 null, v2.l +// GFX11: v_cmpx_nlg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x14,0x7d] -v_cmpx_nlg_f16 m0, v2 -// GFX11: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d] +v_cmpx_nlg_f16 -1, v2.l +// GFX11: v_cmpx_nlg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x14,0x7d] -v_cmpx_nlg_f16 exec_lo, v2 -// GFX11: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d] +v_cmpx_nlg_f16 0.5, v2.l +// GFX11: v_cmpx_nlg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x14,0x7d] -v_cmpx_nlg_f16 exec_hi, v2 -// GFX11: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d] +v_cmpx_nlg_f16 src_scc, v2.l +// GFX11: v_cmpx_nlg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x14,0x7d] -v_cmpx_nlg_f16 null, v2 -// GFX11: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d] +v_cmpx_nlg_f16 0xfe0b, v127.l +// GFX11: v_cmpx_nlg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_nlg_f16 -1, v2 -// GFX11: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d] +v_cmpx_nlg_f16 v1.h, v2.l +// GFX11: v_cmpx_nlg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x14,0x7d] -v_cmpx_nlg_f16 0.5, v2 -// GFX11: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d] +v_cmpx_nlg_f16 v127.h, v2.l +// GFX11: v_cmpx_nlg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x14,0x7d] -v_cmpx_nlg_f16 src_scc, v2 -// GFX11: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d] +v_cmpx_nlg_f16 0.5, v127.l +// GFX11: v_cmpx_nlg_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x14,0x7d] -v_cmpx_nlg_f16 0xfe0b, v127 -// GFX11: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nlg_f16 src_scc, v2.h +// GFX11: v_cmpx_nlg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x15,0x7d] + +v_cmpx_nlg_f16 0xfe0b, v127.h +// GFX11: v_cmpx_nlg_f16_e32 0xfe0b, v127.h ; encoding: 
[0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nlg_f32 v1, v2 // GFX11: v_cmpx_nlg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x34,0x7d] @@ -3539,50 +3689,65 @@ v_cmpx_nlg_f64 src_scc, v[2:3] v_cmpx_nlg_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_nlg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nlt_f16 v1, v2 -// GFX11: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d] +v_cmpx_nlt_f16 v1.l, v2.l +// GFX11: v_cmpx_nlt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1c,0x7d] + +v_cmpx_nlt_f16 v127.l, v2.l +// GFX11: v_cmpx_nlt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1c,0x7d] + +v_cmpx_nlt_f16 s1, v2.l +// GFX11: v_cmpx_nlt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1c,0x7d] + +v_cmpx_nlt_f16 s105, v2.l +// GFX11: v_cmpx_nlt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1c,0x7d] + +v_cmpx_nlt_f16 vcc_lo, v2.l +// GFX11: v_cmpx_nlt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 v127, v2 -// GFX11: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d] +v_cmpx_nlt_f16 vcc_hi, v2.l +// GFX11: v_cmpx_nlt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 s1, v2 -// GFX11: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 ttmp15, v2.l +// GFX11: v_cmpx_nlt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 s105, v2 -// GFX11: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 m0, v2.l +// GFX11: v_cmpx_nlt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 vcc_lo, v2 -// GFX11: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 exec_lo, v2.l +// GFX11: v_cmpx_nlt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 vcc_hi, v2 -// GFX11: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 exec_hi, v2.l +// GFX11: v_cmpx_nlt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 ttmp15, v2 -// GFX11: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 null, v2.l +// GFX11: v_cmpx_nlt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 m0, v2 -// GFX11: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 -1, v2.l +// GFX11: v_cmpx_nlt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 exec_lo, v2 -// GFX11: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 0.5, v2.l +// GFX11: v_cmpx_nlt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 exec_hi, v2 -// GFX11: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 src_scc, v2.l +// GFX11: v_cmpx_nlt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 null, v2 -// GFX11: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 0xfe0b, v127.l +// GFX11: v_cmpx_nlt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_nlt_f16 -1, v2 -// GFX11: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 v1.h, v2.l +// GFX11: v_cmpx_nlt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1c,0x7d] -v_cmpx_nlt_f16 0.5, v2 -// GFX11: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 v127.h, v2.l +// GFX11: v_cmpx_nlt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1c,0x7d] -v_cmpx_nlt_f16 src_scc, v2 -// GFX11: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 0.5, v127.l +// GFX11: 
v_cmpx_nlt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1c,0x7d] -v_cmpx_nlt_f16 0xfe0b, v127 -// GFX11: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nlt_f16 src_scc, v2.h +// GFX11: v_cmpx_nlt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1d,0x7d] + +v_cmpx_nlt_f16 0xfe0b, v127.h +// GFX11: v_cmpx_nlt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nlt_f32 v1, v2 // GFX11: v_cmpx_nlt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3c,0x7d] @@ -3665,50 +3830,65 @@ v_cmpx_nlt_f64 src_scc, v[2:3] v_cmpx_nlt_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_nlt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_o_f16 v1, v2 -// GFX11: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d] +v_cmpx_o_f16 v1.l, v2.l +// GFX11: v_cmpx_o_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0e,0x7d] + +v_cmpx_o_f16 v127.l, v2.l +// GFX11: v_cmpx_o_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0e,0x7d] + +v_cmpx_o_f16 s1, v2.l +// GFX11: v_cmpx_o_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0e,0x7d] + +v_cmpx_o_f16 s105, v2.l +// GFX11: v_cmpx_o_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0e,0x7d] + +v_cmpx_o_f16 vcc_lo, v2.l +// GFX11: v_cmpx_o_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0e,0x7d] -v_cmpx_o_f16 v127, v2 -// GFX11: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d] +v_cmpx_o_f16 vcc_hi, v2.l +// GFX11: v_cmpx_o_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0e,0x7d] -v_cmpx_o_f16 s1, v2 -// GFX11: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d] +v_cmpx_o_f16 ttmp15, v2.l +// GFX11: v_cmpx_o_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0e,0x7d] -v_cmpx_o_f16 s105, v2 -// GFX11: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d] +v_cmpx_o_f16 m0, v2.l +// GFX11: v_cmpx_o_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0e,0x7d] -v_cmpx_o_f16 vcc_lo, v2 -// GFX11: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d] +v_cmpx_o_f16 exec_lo, v2.l +// GFX11: v_cmpx_o_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0e,0x7d] -v_cmpx_o_f16 vcc_hi, v2 -// GFX11: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d] +v_cmpx_o_f16 exec_hi, v2.l +// GFX11: v_cmpx_o_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0e,0x7d] -v_cmpx_o_f16 ttmp15, v2 -// GFX11: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d] +v_cmpx_o_f16 null, v2.l +// GFX11: v_cmpx_o_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0e,0x7d] -v_cmpx_o_f16 m0, v2 -// GFX11: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d] +v_cmpx_o_f16 -1, v2.l +// GFX11: v_cmpx_o_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0e,0x7d] -v_cmpx_o_f16 exec_lo, v2 -// GFX11: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d] +v_cmpx_o_f16 0.5, v2.l +// GFX11: v_cmpx_o_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0e,0x7d] -v_cmpx_o_f16 exec_hi, v2 -// GFX11: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d] +v_cmpx_o_f16 src_scc, v2.l +// GFX11: v_cmpx_o_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0e,0x7d] -v_cmpx_o_f16 null, v2 -// GFX11: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d] +v_cmpx_o_f16 0xfe0b, v127.l +// GFX11: v_cmpx_o_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_o_f16 -1, v2 -// GFX11: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d] +v_cmpx_o_f16 v1.h, v2.l +// GFX11: v_cmpx_o_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0e,0x7d] -v_cmpx_o_f16 0.5, v2 -// GFX11: v_cmpx_o_f16_e32 0.5, v2 ; encoding: 
[0xf0,0x04,0x0e,0x7d] +v_cmpx_o_f16 v127.h, v2.l +// GFX11: v_cmpx_o_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0e,0x7d] -v_cmpx_o_f16 src_scc, v2 -// GFX11: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d] +v_cmpx_o_f16 0.5, v127.l +// GFX11: v_cmpx_o_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x0e,0x7d] -v_cmpx_o_f16 0xfe0b, v127 -// GFX11: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_o_f16 src_scc, v2.h +// GFX11: v_cmpx_o_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0f,0x7d] + +v_cmpx_o_f16 0xfe0b, v127.h +// GFX11: v_cmpx_o_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_o_f32 v1, v2 // GFX11: v_cmpx_o_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2e,0x7d] @@ -3791,50 +3971,80 @@ v_cmpx_o_f64 src_scc, v[2:3] v_cmpx_o_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_o_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_t_f16 v1, v2 -// GFX11: v_cmpx_t_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1e,0x7d] +v_cmpx_t_f16 v1.l, v2.l +// GFX11: v_cmpx_t_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1e,0x7d] -v_cmpx_t_f16 v127, v2 -// GFX11: v_cmpx_t_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7d] +v_cmpx_t_f16 v127.l, v2.l +// GFX11: v_cmpx_t_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1e,0x7d] -v_cmpx_t_f16 s1, v2 -// GFX11: v_cmpx_t_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1e,0x7d] +v_cmpx_t_f16 s1, v2.l +// GFX11: v_cmpx_t_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1e,0x7d] -v_cmpx_t_f16 s105, v2 -// GFX11: v_cmpx_t_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1e,0x7d] +v_cmpx_t_f16 s105, v2.l +// GFX11: v_cmpx_t_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1e,0x7d] -v_cmpx_t_f16 vcc_lo, v2 -// GFX11: v_cmpx_t_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7d] +v_cmpx_t_f16 vcc_lo, v2.l +// GFX11: v_cmpx_t_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1e,0x7d] -v_cmpx_t_f16 vcc_hi, v2 -// GFX11: v_cmpx_t_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7d] +v_cmpx_t_f16 vcc_hi, v2.l +// GFX11: v_cmpx_t_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1e,0x7d] -v_cmpx_t_f16 ttmp15, v2 -// GFX11: v_cmpx_t_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7d] +v_cmpx_t_f16 ttmp15, v2.l +// GFX11: v_cmpx_t_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1e,0x7d] -v_cmpx_t_f16 m0, v2 -// GFX11: v_cmpx_t_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7d] +v_cmpx_t_f16 m0, v2.l +// GFX11: v_cmpx_t_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1e,0x7d] -v_cmpx_t_f16 exec_lo, v2 -// GFX11: v_cmpx_t_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7d] +v_cmpx_t_f16 exec_lo, v2.l +// GFX11: v_cmpx_t_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1e,0x7d] -v_cmpx_t_f16 exec_hi, v2 -// GFX11: v_cmpx_t_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7d] +v_cmpx_t_f16 exec_hi, v2.l +// GFX11: v_cmpx_t_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1e,0x7d] -v_cmpx_t_f16 null, v2 -// GFX11: v_cmpx_t_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1e,0x7d] +v_cmpx_t_f16 null, v2.l +// GFX11: v_cmpx_t_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1e,0x7d] -v_cmpx_t_f16 -1, v2 -// GFX11: v_cmpx_t_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7d] +v_cmpx_t_f16 -1, v2.l +// GFX11: v_cmpx_t_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1e,0x7d] -v_cmpx_t_f16 0.5, v2 -// GFX11: v_cmpx_t_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7d] +v_cmpx_t_f16 0.5, v2.l +// GFX11: v_cmpx_t_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1e,0x7d] -v_cmpx_t_f16 src_scc, v2 -// GFX11: v_cmpx_t_f16_e32 src_scc, v2 ; encoding: 
[0xfd,0x04,0x1e,0x7d] +v_cmpx_t_f16 src_scc, v2.l +// GFX11: v_cmpx_t_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1e,0x7d] -v_cmpx_t_f16 0xfe0b, v127 -// GFX11: v_cmpx_t_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_t_f16 0xfe0b, v127.l +// GFX11: v_cmpx_t_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_t_f16 v1.h, v2.l +// GFX11: v_cmpx_t_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1e,0x7d] + +v_cmpx_tru_f16 v1.h, v2.l +// GFX11: v_cmpx_t_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1e,0x7d] + +v_cmpx_t_f16 v127.h, v2.l +// GFX11: v_cmpx_t_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1e,0x7d] + +v_cmpx_tru_f16 v127.h, v2.l +// GFX11: v_cmpx_t_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1e,0x7d] + +v_cmpx_t_f16 0.5, v127.l +// GFX11: v_cmpx_t_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1e,0x7d] + +v_cmpx_tru_f16 0.5, v127.l +// GFX11: v_cmpx_t_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1e,0x7d] + +v_cmpx_t_f16 src_scc, v2.h +// GFX11: v_cmpx_t_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1f,0x7d] + +v_cmpx_tru_f16 src_scc, v2.h +// GFX11: v_cmpx_t_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1f,0x7d] + +v_cmpx_t_f16 0xfe0b, v127.h +// GFX11: v_cmpx_t_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1f,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_tru_f16 0xfe0b, v127.h +// GFX11: v_cmpx_t_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1f,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_t_f32 v1, v2 // GFX11: v_cmpx_t_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3e,0x7d] @@ -4079,41 +4289,41 @@ v_cmpx_t_u64 src_scc, v[2:3] v_cmpx_t_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_t_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbf,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_tru_f16 v1, v2 -// GFX11: v_cmpx_t_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1e,0x7d] +v_cmpx_tru_f16 v1.l, v2.l +// GFX11: v_cmpx_t_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1e,0x7d] -v_cmpx_tru_f16 v127, v2 -// GFX11: v_cmpx_t_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7d] +v_cmpx_tru_f16 v127.l, v2.l +// GFX11: v_cmpx_t_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1e,0x7d] -v_cmpx_tru_f16 s1, v2 -// GFX11: v_cmpx_t_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1e,0x7d] +v_cmpx_tru_f16 s1, v2.l +// GFX11: v_cmpx_t_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1e,0x7d] -v_cmpx_tru_f16 s105, v2 -// GFX11: v_cmpx_t_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1e,0x7d] +v_cmpx_tru_f16 s105, v2.l +// GFX11: v_cmpx_t_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1e,0x7d] -v_cmpx_tru_f16 vcc_lo, v2 -// GFX11: v_cmpx_t_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7d] +v_cmpx_tru_f16 vcc_lo, v2.l +// GFX11: v_cmpx_t_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1e,0x7d] -v_cmpx_tru_f16 vcc_hi, v2 -// GFX11: v_cmpx_t_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7d] +v_cmpx_tru_f16 vcc_hi, v2.l +// GFX11: v_cmpx_t_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1e,0x7d] -v_cmpx_tru_f16 ttmp15, v2 -// GFX11: v_cmpx_t_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7d] +v_cmpx_tru_f16 ttmp15, v2.l +// GFX11: v_cmpx_t_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1e,0x7d] -v_cmpx_tru_f16 m0, v2 -// GFX11: v_cmpx_t_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7d] +v_cmpx_tru_f16 m0, v2.l +// GFX11: v_cmpx_t_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1e,0x7d] -v_cmpx_tru_f16 exec_lo, v2 -// GFX11: v_cmpx_t_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7d] +v_cmpx_tru_f16 exec_lo, v2.l +// GFX11: v_cmpx_t_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1e,0x7d] -v_cmpx_tru_f16 exec_hi, 
v2 -// GFX11: v_cmpx_t_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7d] +v_cmpx_tru_f16 exec_hi, v2.l +// GFX11: v_cmpx_t_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1e,0x7d] -v_cmpx_tru_f16 null, v2 -// GFX11: v_cmpx_t_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1e,0x7d] +v_cmpx_tru_f16 null, v2.l +// GFX11: v_cmpx_t_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1e,0x7d] -v_cmpx_tru_f16 -1, v2 -// GFX11: v_cmpx_t_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7d] +v_cmpx_tru_f16 -1, v2.l +// GFX11: v_cmpx_t_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1e,0x7d] v_cmpx_tru_f16 0.5, v2 // GFX11: v_cmpx_t_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7d] @@ -4205,50 +4415,65 @@ v_cmpx_tru_f64 src_scc, v[2:3] v_cmpx_tru_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_t_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5f,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_u_f16 v1, v2 -// GFX11: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d] +v_cmpx_u_f16 v1.l, v2.l +// GFX11: v_cmpx_u_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x10,0x7d] + +v_cmpx_u_f16 v127.l, v2.l +// GFX11: v_cmpx_u_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x10,0x7d] + +v_cmpx_u_f16 s1, v2.l +// GFX11: v_cmpx_u_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x10,0x7d] + +v_cmpx_u_f16 s105, v2.l +// GFX11: v_cmpx_u_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x10,0x7d] + +v_cmpx_u_f16 vcc_lo, v2.l +// GFX11: v_cmpx_u_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x10,0x7d] + +v_cmpx_u_f16 vcc_hi, v2.l +// GFX11: v_cmpx_u_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x10,0x7d] -v_cmpx_u_f16 v127, v2 -// GFX11: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d] +v_cmpx_u_f16 ttmp15, v2.l +// GFX11: v_cmpx_u_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x10,0x7d] -v_cmpx_u_f16 s1, v2 -// GFX11: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d] +v_cmpx_u_f16 m0, v2.l +// GFX11: v_cmpx_u_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x10,0x7d] -v_cmpx_u_f16 s105, v2 -// GFX11: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d] +v_cmpx_u_f16 exec_lo, v2.l +// GFX11: v_cmpx_u_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x10,0x7d] -v_cmpx_u_f16 vcc_lo, v2 -// GFX11: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d] +v_cmpx_u_f16 exec_hi, v2.l +// GFX11: v_cmpx_u_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x10,0x7d] -v_cmpx_u_f16 vcc_hi, v2 -// GFX11: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d] +v_cmpx_u_f16 null, v2.l +// GFX11: v_cmpx_u_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x10,0x7d] -v_cmpx_u_f16 ttmp15, v2 -// GFX11: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d] +v_cmpx_u_f16 -1, v2.l +// GFX11: v_cmpx_u_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x10,0x7d] -v_cmpx_u_f16 m0, v2 -// GFX11: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] +v_cmpx_u_f16 0.5, v2.l +// GFX11: v_cmpx_u_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x10,0x7d] -v_cmpx_u_f16 exec_lo, v2 -// GFX11: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d] +v_cmpx_u_f16 src_scc, v2.l +// GFX11: v_cmpx_u_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x10,0x7d] -v_cmpx_u_f16 exec_hi, v2 -// GFX11: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] +v_cmpx_u_f16 0xfe0b, v127.l +// GFX11: v_cmpx_u_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_u_f16 null, v2 -// GFX11: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] +v_cmpx_u_f16 v1.h, v2.l +// GFX11: v_cmpx_u_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x10,0x7d] -v_cmpx_u_f16 -1, v2 -// 
GFX11: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d]

+v_cmpx_u_f16 v127.h, v2.l
+// GFX11: v_cmpx_u_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x10,0x7d]

-v_cmpx_u_f16 0.5, v2
-// GFX11: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d]
+v_cmpx_u_f16 0.5, v127.l
+// GFX11: v_cmpx_u_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x10,0x7d]

-v_cmpx_u_f16 src_scc, v2
-// GFX11: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d]
+v_cmpx_u_f16 src_scc, v2.h
+// GFX11: v_cmpx_u_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x11,0x7d]

-v_cmpx_u_f16 0xfe0b, v127
-// GFX11: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_u_f16 0xfe0b, v127.h
+// GFX11: v_cmpx_u_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00]

 v_cmpx_u_f32 v1, v2
 // GFX11: v_cmpx_u_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x30,0x7d]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s
index e8d458874596e..1864a32c9f133 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s
@@ -374,47 +374,56 @@ v_cmpx_eq_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_f_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_f_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_f_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_f_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_f_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_f_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_f_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_f_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_f_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_f_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_f_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_f_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_f_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_f_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_f_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_f_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_f_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_f_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_f_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_f_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_f_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_f_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_f_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_f_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_f_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_f_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_f_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_f_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_f_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_f_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_f_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x01,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_f_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_f_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x01,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_f_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_f_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x1b,0x00,0xff]
@@ -542,47 +551,56 @@ v_cmpx_f_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_f_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_f_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x91,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_ge_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_ge_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_ge_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_ge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_ge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xf5,0x30]

-v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_ge_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_ge_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x5f,0x01,0x01]

-v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_ge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_ge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0d,0x7d,0x81,0x60,0x09,0x13]

-v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_ge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_ge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff]
@@ -812,47 +830,56 @@ v_cmpx_ge_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_gt_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_gt_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_gt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_gt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_gt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_gt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_gt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_gt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_gt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_gt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x09,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_gt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_gt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff]
@@ -1082,47 +1109,56 @@ v_cmpx_gt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_le_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_le_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_le_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_le_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_le_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_le_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_le_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_le_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xf5,0x30]

-v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_le_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_le_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x5f,0x01,0x01]

-v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_le_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_le_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x07,0x7d,0x81,0x60,0x09,0x13]

-v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_le_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_le_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff]
@@ -1352,47 +1388,56 @@ v_cmpx_le_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_lg_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_lg_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_lg_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_lg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_lg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_lg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_lg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_lg_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_lg_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_lg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_lg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0b,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_lg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_lg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff]
@@ -1901,47 +1946,56 @@ v_cmpx_ne_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_neq_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_neq_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_neq_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_neq_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_neq_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_neq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_neq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xf5,0x30]

-v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_neq_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_neq_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x5f,0x01,0x01]

-v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_neq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_neq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1b,0x7d,0x81,0x60,0x09,0x13]

-v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_neq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_neq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff]
@@ -1985,47 +2039,56 @@ v_cmpx_neq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_nge_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_nge_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_nge_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_nge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_nge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_nge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_nge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_nge_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nge_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_nge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x13,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_nge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff]
@@ -2069,47 +2132,56 @@ v_cmpx_nge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_ngt_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_ngt_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_ngt_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_ngt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_ngt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xf5,0x30]

-v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_ngt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_ngt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x5f,0x01,0x01]

-v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_ngt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_ngt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x17,0x7d,0x81,0x60,0x09,0x13]

-v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_ngt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_ngt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff]
@@ -2153,47 +2225,56 @@ v_cmpx_ngt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_nle_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_nle_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_nle_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_nle_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_nle_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_nle_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_nle_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_nle_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nle_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_nle_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nle_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x19,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_nle_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nle_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff]
@@ -2237,47 +2318,56 @@ v_cmpx_nle_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_nlg_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_nlg_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_nlg_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_nlg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_nlg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xf5,0x30]

-v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_nlg_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlg_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x5f,0x01,0x01]

-v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_nlg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nlg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x15,0x7d,0x81,0x60,0x09,0x13]

-v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_nlg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nlg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff]
@@ -2321,47 +2411,56 @@ v_cmpx_nlg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_nlt_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_nlt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_nlt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_nlt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_nlt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_nlt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_nlt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nlt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1d,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_nlt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nlt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff]
@@ -2405,47 +2504,56 @@ v_cmpx_nlt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_o_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_o_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_o_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_o_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_o_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_o_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_o_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_o_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_o_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_o_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_o_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_o_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_o_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_o_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf
; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shl:1 -// GFX11: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shl:15 -// GFX11: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shr:1 -// GFX11: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shr:15 -// GFX11: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_ror:1 -// GFX11: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_o_f16 v1, v2 row_ror:15 -// GFX11: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_o_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_o_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_o_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_o_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x5f,0x01,0x01] -v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_o_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_o_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0f,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 
-// GFX11: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_o_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_o_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xf5,0x30]

v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0]
// GFX11: v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff]
@@ -2489,47 +2597,65 @@ v_cmpx_o_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_t_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_t_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_t_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_t_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_t_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_t_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_t_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_t_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_t_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_t_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_t_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_t_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_t_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_t_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_t_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_t_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_t_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_t_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_t_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_t_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_t_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_t_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_t_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_t_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_t_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_t_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_tru_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_t_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_t_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_t_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1f,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_tru_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_t_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1f,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_t_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_t_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1f,0x7d,0xff,0x6f,0xf5,0x30]
+
+v_cmpx_tru_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_t_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1f,0x7d,0xff,0x6f,0xf5,0x30]

v_cmpx_t_f32 v1, v2 quad_perm:[3,2,1,0]
// GFX11: v_cmpx_t_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x1b,0x00,0xff]
@@ -2657,38 +2783,38 @@ v_cmpx_t_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
v_cmpx_t_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: v_cmpx_t_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x9f,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_tru_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_tru_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_tru_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_tru_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_t_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_tru_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_t_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_t_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_t_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_t_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_t_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_t_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_t_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_t_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_t_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff]

v_cmpx_tru_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
// GFX11: v_cmpx_t_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01]
@@ -2741,47 +2867,56 @@ v_cmpx_tru_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
v_cmpx_tru_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: v_cmpx_t_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x3f,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_u_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_u_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_u_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_u_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_u_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_u_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_u_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_u_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_u_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xf5,0x30]

-v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_u_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_u_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x5f,0x01,0x01]

-v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_u_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_u_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x11,0x7d,0x81,0x60,0x09,0x13]

-v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_u_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_u_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xf5,0x30]

v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0]
// GFX11: v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff]

diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s
index 4f8895faf10a2..1d664e4ecb902 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s
@@ -110,14 +110,23 @@ v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x00,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_f_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x00,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_f_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_f_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_f_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_f_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_f_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_f_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x01,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_f_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_f_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x01,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_f_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_f_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x20,0x7d,0x01,0x77,0x39,0x05]
@@ -146,14 +155,23 @@ v_cmpx_f_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_f_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_f_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x91,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_ge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_ge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_ge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05]
@@ -218,14 +236,23 @@ v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x08,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x08,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_gt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_gt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_gt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x09,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05]
@@ -290,14 +317,23 @@ v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_le_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_le_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x07,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05]
@@ -362,14 +398,23 @@ v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_lg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_lg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_lg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05]
@@ -515,14 +560,23 @@ v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_neq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_neq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_neq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_neq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_neq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_neq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05]
@@ -533,14 +587,23 @@ v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_nge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_nge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_nge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x13,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05]
@@ -551,14 +614,23 @@ v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_ngt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x17,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_ngt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_ngt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05]
@@ -569,14 +641,23 @@ v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_nle_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_nle_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_nle_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nle_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x19,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05]
@@ -587,14 +668,23 @@ v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_nlg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x15,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05]
@@ -605,14 +695,23 @@ v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_nlt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_nlt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05]
@@ -623,14 +722,23 @@ v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_o_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_o_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_o_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_o_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05]
@@ -641,14 +749,32 @@ v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_t_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_t_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_t_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_t_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_t_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_t_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_t_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_tru_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_t_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_t_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_tru_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_t_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_t_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_t_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00]
+
+v_cmpx_tru_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_t_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_t_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_t_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3e,0x7d,0x01,0x77,0x39,0x05]
@@ -677,8 +803,8 @@ v_cmpx_t_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_t_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_t_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x9f,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_tru_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_tru_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]

v_cmpx_tru_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX11: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]
@@ -695,14 +821,23 @@ v_cmpx_tru_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_tru_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_t_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x3f,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_u_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_u_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_u_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_u_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x11,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05]

diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s
index fe2220d7f5902..a1ab032e85d75 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s
@@ -145,41 +145,77 @@ v_cmpx_eq_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

-v_cmpx_f_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_f_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_f_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_f_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_f_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_f_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_f_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_f_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_ge_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_f_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_ge_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_ge_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_ge_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_f_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_ge_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_ge_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

v_cmpx_ge_i16_e32 v1.h, v255.h
// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
@@ -253,23 +289,41 @@ v_cmpx_ge_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

-v_cmpx_gt_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_gt_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_gt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_gt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_gt_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_gt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_gt_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_gt_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_gt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_gt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_gt_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_gt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_gt_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_gt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_gt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_gt_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_gt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_gt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

v_cmpx_gt_i16_e32 v1.h, v255.h
// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
@@ -343,23 +397,41 @@ v_cmpx_gt_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

-v_cmpx_le_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_le_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_le_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_le_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_le_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_le_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_le_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_le_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_le_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_le_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_le_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_le_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_le_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_le_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_le_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_le_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_le_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_le_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

v_cmpx_le_i16_e32 v1.h, v255.h
// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
@@ -433,23 +505,41 @@ v_cmpx_le_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

-v_cmpx_lg_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_lg_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_lg_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_lg_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_lg_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_lg_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_lg_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_lg_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_lg_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_lg_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_lg_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_lg_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_lg_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_lg_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_lg_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_lg_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_lg_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_lg_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

v_cmpx_lt_f16_e32 v1.h, v255.h
// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
@@ -631,148 +721,292 @@ v_cmpx_ne_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ne_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

-v_cmpx_neq_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_neq_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_neq_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_neq_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_neq_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_neq_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_neq_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_neq_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_neq_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nge_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_neq_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nge_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nge_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_nge_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nge_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nge_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for
instruction -v_cmpx_nge_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nge_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nge_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nge_f16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nge_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nge_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nge_f16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nge_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nge_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_ngt_f16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_ngt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ngt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_ngt_f16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_ngt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ngt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_ngt_f16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction 
+v_cmpx_ngt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlg_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_ngt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlg_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_ngt_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlg_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_ngt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlg_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_ngt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nle_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nlg_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nle_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nlg_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nle_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nlt_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_nle_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nlt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nle_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nlt_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nle_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nle_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlt_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_nle_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nle_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nle_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlt_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nle_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_o_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_nle_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_o_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_o_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_o_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_nlg_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_o_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_o_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_t_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_t_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_t_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlg_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlg_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlg_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlg_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_t_f16_e32 v255, v2
+v_cmpx_o_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_t_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmpx_t_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_t_f16_e32 v255, v2 quad_perm:[3,2,1,0]
+v_cmpx_t_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

v_cmpx_tru_f16_e32 v1, v255
@@ -793,20 +1027,38 @@ v_cmpx_tru_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_tru_f16_e32 v255, v2 quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_u_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_u_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_u_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_u_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_u_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_u_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_u_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_u_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_u_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_u_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_u_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_u_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_u_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_u_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_u_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_u_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_u_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_u_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s
index c6814de818e6d..233858fd3021c 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s
@@ -145,41 +145,77 @@ v_cmpx_eq_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_u16 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_f_f16 v1, v255
-// GFX11: v_cmpx_f_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_f_f16 v1.h, v255.h
+// GFX11: v_cmpx_f_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x80,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_f_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_f_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x80,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_f_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_f_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x80,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_f_f16 v255, v2
-// GFX11: v_cmpx_f_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_f_f16 v1.l, v255.l
+// GFX11: v_cmpx_f_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_f_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_f_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_f_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_f_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_f_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_ge_f16 v1, v255
-// GFX11: v_cmpx_ge_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_f_f16 v255.h, v2.h
+// GFX11: v_cmpx_f_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x80,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_ge_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ge_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x80,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_ge_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ge_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_f_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x80,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_ge_f16 v255, v2
-// GFX11: v_cmpx_ge_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_f_f16 v255.l, v2.l
+// GFX11: v_cmpx_f_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_ge_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ge_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_f_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_ge_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ge_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_f_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_ge_f16 v1.h, v255.h
+// GFX11: v_cmpx_ge_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x86,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_ge_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_f16 v1.l, v255.l
+// GFX11: v_cmpx_ge_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_ge_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_f16 v255.h, v2.h
+// GFX11: v_cmpx_ge_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x86,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_ge_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_ge_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_ge_f16 v255.l, v2.l
+// GFX11: v_cmpx_ge_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_ge_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_ge_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

v_cmpx_ge_i16 v1.h, v255.h
// GFX11: v_cmpx_ge_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb6,0xd4,0x01,0xff,0x03,0x00]
@@ -253,23 +289,41 @@ v_cmpx_ge_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_u16 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_gt_f16 v1, v255
-// GFX11: v_cmpx_gt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_gt_f16 v1.h, v255.h
+// GFX11: v_cmpx_gt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x84,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_gt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_gt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_gt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_gt_f16 v255, v2
-// GFX11: v_cmpx_gt_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_gt_f16 v1.l, v255.l
+// GFX11: v_cmpx_gt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_gt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_gt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_gt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_gt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_gt_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_f16 v255.h, v2.h
+// GFX11: v_cmpx_gt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x84,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_gt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_gt_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_gt_f16 v255.l, v2.l
+// GFX11: v_cmpx_gt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_gt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_gt_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

v_cmpx_gt_i16 v1.h, v255.h
// GFX11: v_cmpx_gt_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb4,0xd4,0x01,0xff,0x03,0x00]
@@ -343,23 +397,41 @@ v_cmpx_gt_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_u16 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_le_f16 v1, v255
-// GFX11: v_cmpx_le_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_le_f16 v1.h, v255.h
+// GFX11: v_cmpx_le_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x83,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_le_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_f16 v1.l, v255.l
+// GFX11: v_cmpx_le_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_le_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_le_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_le_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16 v255.h, v2.h
+// GFX11: v_cmpx_le_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x83,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_le_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_le_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_le_f16 v255, v2
-// GFX11: v_cmpx_le_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_le_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_le_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_le_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_le_f16 v255.l, v2.l
+// GFX11: v_cmpx_le_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_le_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_le_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_le_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_le_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

v_cmpx_le_i16 v1.h, v255.h
// GFX11: v_cmpx_le_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb3,0xd4,0x01,0xff,0x03,0x00]
@@ -433,23 +505,41 @@ v_cmpx_le_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_u16 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: v_cmpx_le_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_lg_f16 v1, v255
-// GFX11: v_cmpx_lg_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_lg_f16 v1.h, v255.h
+// GFX11: v_cmpx_lg_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x85,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_lg_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lg_f16 v1.l, v255.l
+// GFX11: v_cmpx_lg_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_lg_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_lg_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_lg_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16 v255.h, v2.h
+// GFX11: v_cmpx_lg_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x85,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_lg_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_lg_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lg_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
-v_cmpx_lg_f16 v255, v2
-// GFX11: v_cmpx_lg_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_lg_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_lg_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_lg_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_lg_f16 v255.l, v2.l
+// GFX11: v_cmpx_lg_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_lg_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_lg_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_lg_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_lg_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

v_cmpx_lt_f16 v1.h, v255.h
// GFX11: v_cmpx_lt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x81,0xd4,0x01,0xff,0x03,0x00]
@@ -631,182 +721,362 @@ v_cmpx_ne_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ne_u16 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_neq_f16 v1, v255
-// GFX11: v_cmpx_neq_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_neq_f16 v1.h, v255.h
+// GFX11: v_cmpx_neq_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8d,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_neq_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_neq_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_neq_f16 v1.l, v255.l
+// GFX11: v_cmpx_neq_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_neq_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_neq_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_neq_f16 v255.h, v2.h
+// GFX11: v_cmpx_neq_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8d,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_neq_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_neq_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_neq_f16 v255.l, v2.l
+// GFX11: v_cmpx_neq_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_neq_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_neq_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nge_f16 v1.h, v255.h
+// GFX11: v_cmpx_nge_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x89,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_nge_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_nge_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nge_f16 v1.l, v255.l
+// GFX11: v_cmpx_nge_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_nge_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_nge_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nge_f16 v255.h, v2.h
+// GFX11: v_cmpx_nge_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x89,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_nge_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_nge_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nge_f16 v255.l, v2.l
+// GFX11: v_cmpx_nge_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_nge_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_nge_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f16 v1.h, v255.h
+// GFX11: v_cmpx_ngt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8b,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_ngt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f16 v1.l, v255.l
+// GFX11: v_cmpx_ngt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_ngt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f16 v255.h, v2.h
+// GFX11: v_cmpx_ngt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8b,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_ngt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f16 v255.l, v2.l
+// GFX11: v_cmpx_ngt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_ngt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nle_f16 v1.h, v255.h
+// GFX11: v_cmpx_nle_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8c,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_nle_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_nle_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nle_f16 v1.l, v255.l
+// GFX11: v_cmpx_nle_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_nle_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_nle_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nle_f16 v255.h, v2.h
+// GFX11: v_cmpx_nle_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8c,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_nle_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_nle_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nle_f16 v255.l, v2.l
+// GFX11: v_cmpx_nle_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_nle_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_nle_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f16 v1.h, v255.h
+// GFX11: v_cmpx_nlg_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8a,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_nlg_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f16 v1.l, v255.l
+// GFX11: v_cmpx_nlg_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_nlg_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f16 v255.h, v2.h
+// GFX11: v_cmpx_nlg_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8a,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_nlg_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f16 v255.l, v2.l
+// GFX11: v_cmpx_nlg_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_nlg_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nlt_f16 v1.h, v255.h
+// GFX11: v_cmpx_nlt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8e,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_neq_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_neq_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_nlt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_neq_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_neq_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlt_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_neq_f16 v255, v2
-// GFX11: v_cmpx_neq_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_nlt_f16 v1.l, v255.l
+// GFX11: v_cmpx_nlt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_neq_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_neq_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_nlt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_neq_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_neq_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_nlt_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_nge_f16 v1, v255
-// GFX11: v_cmpx_nge_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_nlt_f16 v255.h, v2.h
+// GFX11: v_cmpx_nlt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8e,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_nge_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nge_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_nlt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_nge_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nge_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlt_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_nge_f16 v255, v2
-// GFX11: v_cmpx_nge_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_nlt_f16 v255.l, v2.l
+// GFX11: v_cmpx_nlt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_nge_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nge_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_nlt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_nge_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nge_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_nlt_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_ngt_f16 v1, v255
-// GFX11: v_cmpx_ngt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_o_f16 v1.h, v255.h
+// GFX11: v_cmpx_o_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x87,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_ngt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_o_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_ngt_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_o_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_ngt_f16 v255, v2
-// GFX11: v_cmpx_ngt_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_o_f16 v1.l, v255.l
+// GFX11: v_cmpx_o_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_ngt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ngt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_o_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_ngt_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ngt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_o_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_nle_f16 v1, v255
-// GFX11: v_cmpx_nle_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_o_f16 v255.h, v2.h
+// GFX11: v_cmpx_o_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x87,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_nle_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_o_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_nle_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_o_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_nle_f16 v255, v2
-// GFX11: v_cmpx_nle_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_o_f16 v255.l, v2.l
+// GFX11: v_cmpx_o_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_nle_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nle_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_o_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_nle_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nle_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_o_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_nlg_f16 v1, v255
-// GFX11: v_cmpx_nlg_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_t_f16 v1.h, v255.h
+// GFX11: v_cmpx_t_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8f,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_nlg_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_t_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_nlg_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_t_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_nlg_f16 v255, v2
-// GFX11: v_cmpx_nlg_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_t_f16 v1.l, v255.l
+// GFX11: v_cmpx_t_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_nlg_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nlg_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_t_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_nlg_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlg_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_t_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_nlt_f16 v1, v255
-// GFX11: v_cmpx_nlt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_t_f16 v255.h, v2.h
+// GFX11: v_cmpx_t_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8f,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_nlt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_t_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_nlt_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_t_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_nlt_f16 v255, v2
-// GFX11: v_cmpx_nlt_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_t_f16 v255.l, v2.l
+// GFX11: v_cmpx_t_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_nlt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nlt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_t_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_nlt_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_t_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_o_f16 v1, v255
-// GFX11: v_cmpx_o_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_tru_f16 v1.h, v255.h
+// GFX11: v_cmpx_t_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8f,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_o_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_tru_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_o_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_tru_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_o_f16 v255, v2
-// GFX11: v_cmpx_o_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_tru_f16 v1.l, v255.l
+// GFX11: v_cmpx_t_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_o_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_o_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_tru_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_o_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_o_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_tru_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_t_f16 v1, v255
-// GFX11: v_cmpx_t_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_tru_f16 v255.h, v2.h
+// GFX11: v_cmpx_t_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8f,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_t_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-//
GFX11: v_cmpx_t_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_tru_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_t_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_tru_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_t_f16 v255, v2 -// GFX11: v_cmpx_t_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_tru_f16 v255.l, v2.l +// GFX11: v_cmpx_t_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_t_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_tru_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_t_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_tru_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_tru_f16 v1, v255 -// GFX11: v_cmpx_t_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_u_f16 v1.h, v255.h +// GFX11: v_cmpx_u_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x88,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_tru_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_u_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_tru_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_tru_f16 v255, v2 -// GFX11: v_cmpx_t_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_u_f16 v1.l, v255.l +// GFX11: v_cmpx_u_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_tru_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_u_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_tru_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: 
v_cmpx_t_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_u_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_u_f16 v1, v255 -// GFX11: v_cmpx_u_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_u_f16 v255.h, v2.h +// GFX11: v_cmpx_u_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x88,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_u_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_u_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_u_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_u_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_u_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x88,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_u_f16 v255, v2 -// GFX11: v_cmpx_u_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_u_f16 v255.l, v2.l +// GFX11: v_cmpx_u_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_u_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_u_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_u_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_u_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_u_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_u_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s index 17bd81fa7d259..cfc7b2c5fb665 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s @@ -542,11 +542,11 @@ v_cmpx_eq_u64_e64 src_scc, exec v_cmpx_eq_u64_e64 0xaf123456, vcc // GFX12: v_cmpx_eq_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_ge_f16_e64 v1, v2 -// GFX12: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ge_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_ge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ge_f16_e64 v255, v255 -// GFX12: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ge_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_ge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_f16_e64 s1, s2 // GFX12: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00] @@ -587,6 +587,12 @@ v_cmpx_ge_f16_e64 -src_scc, |vcc_lo| 
v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_ge_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ge_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_ge_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ge_f32_e64 v1, v2 // GFX12: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00] @@ -932,11 +938,11 @@ v_cmpx_ge_u64_e64 src_scc, exec v_cmpx_ge_u64_e64 0xaf123456, vcc // GFX12: v_cmpx_ge_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_gt_f16_e64 v1, v2 -// GFX12: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_gt_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_gt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_gt_f16_e64 v255, v255 -// GFX12: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_gt_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_gt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_f16_e64 s1, s2 // GFX12: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00] @@ -977,6 +983,12 @@ v_cmpx_gt_f16_e64 -src_scc, |vcc_lo| v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_gt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_gt_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_gt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_gt_f32_e64 v1, v2 // GFX12: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00] @@ -1322,11 +1334,11 @@ v_cmpx_gt_u64_e64 src_scc, exec v_cmpx_gt_u64_e64 0xaf123456, vcc // GFX12: v_cmpx_gt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_le_f16_e64 v1, v2 -// GFX12: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_le_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_le_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_le_f16_e64 v255, v255 -// GFX12: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_le_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_le_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_f16_e64 s1, s2 // GFX12: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00] @@ -1367,6 +1379,12 @@ v_cmpx_le_f16_e64 -src_scc, |vcc_lo| v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_le_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_le_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_le_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_le_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_le_f32_e64 v1, v2 // GFX12: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00] @@ -1712,11 +1730,11 @@ v_cmpx_le_u64_e64 src_scc, exec v_cmpx_le_u64_e64 0xaf123456, vcc // GFX12: v_cmpx_le_u64_e64 0xaf123456, vcc ; encoding: 
[0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_lg_f16_e64 v1, v2 -// GFX12: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_lg_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_lg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_lg_f16_e64 v255, v255 -// GFX12: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_lg_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_lg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lg_f16_e64 s1, s2 // GFX12: v_cmpx_lg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00] @@ -1757,6 +1775,12 @@ v_cmpx_lg_f16_e64 -src_scc, |vcc_lo| v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_lg_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_lg_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_lg_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_lg_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_lg_f32_e64 v1, v2 // GFX12: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00] @@ -2498,11 +2522,11 @@ v_cmpx_ne_u64_e64 src_scc, exec v_cmpx_ne_u64_e64 0xaf123456, vcc // GFX12: v_cmpx_ne_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_neq_f16_e64 v1, v2 -// GFX12: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_neq_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_neq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_neq_f16_e64 v255, v255 -// GFX12: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_neq_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_neq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] v_cmpx_neq_f16_e64 s1, s2 // GFX12: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00] @@ -2543,6 +2567,12 @@ v_cmpx_neq_f16_e64 -src_scc, |vcc_lo| v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_neq_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_neq_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_neq_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_neq_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_neq_f32_e64 v1, v2 // GFX12: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00] @@ -2624,11 +2654,11 @@ v_cmpx_neq_f64_e64 -|src_scc|, -|exec| v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nge_f16_e64 v1, v2 -// GFX12: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nge_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_nge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nge_f16_e64 v255, v255 -// GFX12: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nge_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_nge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nge_f16_e64 s1, s2 // GFX12: v_cmpx_nge_f16_e64 s1, s2 ; encoding: 
[0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00] @@ -2669,6 +2699,12 @@ v_cmpx_nge_f16_e64 -src_scc, |vcc_lo| v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nge_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_nge_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nge_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_nge_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nge_f32_e64 v1, v2 // GFX12: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00] @@ -2750,11 +2786,11 @@ v_cmpx_nge_f64_e64 -|src_scc|, -|exec| v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_ngt_f16_e64 v1, v2 -// GFX12: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ngt_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_ngt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ngt_f16_e64 v255, v255 -// GFX12: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ngt_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_ngt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ngt_f16_e64 s1, s2 // GFX12: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00] @@ -2795,6 +2831,12 @@ v_cmpx_ngt_f16_e64 -src_scc, |vcc_lo| v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_ngt_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_ngt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ngt_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_ngt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ngt_f32_e64 v1, v2 // GFX12: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00] @@ -2876,11 +2918,11 @@ v_cmpx_ngt_f64_e64 -|src_scc|, -|exec| v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nle_f16_e64 v1, v2 -// GFX12: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nle_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_nle_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nle_f16_e64 v255, v255 -// GFX12: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nle_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_nle_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nle_f16_e64 s1, s2 // GFX12: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00] @@ -2921,6 +2963,12 @@ v_cmpx_nle_f16_e64 -src_scc, |vcc_lo| v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nle_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_nle_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nle_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_nle_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nle_f32_e64 v1, v2 // GFX12: v_cmpx_nle_f32_e64 v1, v2 ; 
encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00] @@ -3002,11 +3050,11 @@ v_cmpx_nle_f64_e64 -|src_scc|, -|exec| v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nlg_f16_e64 v1, v2 -// GFX12: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nlg_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_nlg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nlg_f16_e64 v255, v255 -// GFX12: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nlg_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_nlg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nlg_f16_e64 s1, s2 // GFX12: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00] @@ -3047,6 +3095,12 @@ v_cmpx_nlg_f16_e64 -src_scc, |vcc_lo| v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nlg_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_nlg_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nlg_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_nlg_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nlg_f32_e64 v1, v2 // GFX12: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00] @@ -3128,11 +3182,11 @@ v_cmpx_nlg_f64_e64 -|src_scc|, -|exec| v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nlt_f16_e64 v1, v2 -// GFX12: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nlt_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_nlt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nlt_f16_e64 v255, v255 -// GFX12: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nlt_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_nlt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nlt_f16_e64 s1, s2 // GFX12: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00] @@ -3173,6 +3227,12 @@ v_cmpx_nlt_f16_e64 -src_scc, |vcc_lo| v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nlt_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_nlt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nlt_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_nlt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nlt_f32_e64 v1, v2 // GFX12: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00] @@ -3254,11 +3314,11 @@ v_cmpx_nlt_f64_e64 -|src_scc|, -|exec| v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_o_f16_e64 v1, v2 -// GFX12: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_o_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_o_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_o_f16_e64 v255, v255 -// GFX12: v_cmpx_o_f16_e64 v255, v255 ; encoding: 
[0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_o_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_o_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] v_cmpx_o_f16_e64 s1, s2 // GFX12: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00] @@ -3299,6 +3359,12 @@ v_cmpx_o_f16_e64 -src_scc, |vcc_lo| v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_o_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_o_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_o_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_o_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_o_f32_e64 v1, v2 // GFX12: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00] @@ -3380,11 +3446,11 @@ v_cmpx_o_f64_e64 -|src_scc|, -|exec| v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_u_f16_e64 v1, v2 -// GFX12: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_u_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_u_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_u_f16_e64 v255, v255 -// GFX12: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_u_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_u_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] v_cmpx_u_f16_e64 s1, s2 // GFX12: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00] @@ -3425,6 +3491,12 @@ v_cmpx_u_f16_e64 -src_scc, |vcc_lo| v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_u_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_u_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_u_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_u_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_u_f32_e64 v1, v2 // GFX12: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s index 86f4b9a6789dd..aca2b4792f74a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s @@ -422,53 +422,62 @@ v_cmpx_eq_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x86,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 
-// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0x7e,0x93,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -728,53 +737,62 @@ v_cmpx_ge_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_gt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_gt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 
-v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_gt_f16_e64_dpp -v1, 
|v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1034,53 +1052,62 @@ v_cmpx_gt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_le_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_f16_e64_dpp v1.l, 
v2.l row_half_mirror +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1340,53 +1367,62 @@ v_cmpx_le_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lg_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lg_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 
-v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 
fi:1 ; encoding: [0x7e,0x93,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1961,53 +1997,62 @@ v_cmpx_ne_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_neq_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_neq_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_neq_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_neq_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_neq_f16_e64_dpp -v1, |v2| 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2057,53 +2102,62 @@ v_cmpx_neq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_nge_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, 2.0 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2153,53 +2207,62 @@ v_cmpx_nge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, 
v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ngt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ngt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 
bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2249,53 +2312,62 @@ v_cmpx_ngt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nle_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nle_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nle_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_nle_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 
-v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2345,53 +2417,62 @@ v_cmpx_nle_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nlg_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nlg_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_nlg_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 
-v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2441,53 +2522,62 @@ v_cmpx_nlg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nlt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nlt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 
bank_mask:0x1 +// GFX12: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2537,53 +2627,62 @@ v_cmpx_nlt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_o_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_o_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_o_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_o_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_mirror -// GFX12: 
v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; 
encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2633,53 +2732,62 @@ v_cmpx_o_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_u_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_u_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_u_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l 
row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s index 071a00ac73b8a..3503f3d62d737 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s @@ -164,23 
+164,32 @@ v_cmpx_eq_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: 
v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -278,23 +287,32 @@ v_cmpx_ge_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_gt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_gt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0x7e,0x93,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -392,23 +410,32 @@ v_cmpx_gt_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_le_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_le_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_le_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| clamp 
dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -506,23 +533,32 @@ v_cmpx_le_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lg_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lg_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lg_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lg_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; 
encoding: [0x7e,0x0a,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -743,23 +779,32 @@ v_cmpx_ne_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_neq_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_neq_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_neq_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_neq_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_neq_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 
+v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -779,23 +824,32 @@ v_cmpx_neq_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nge_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nge_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nge_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nge_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nge_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nge_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| 
dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -815,23 +869,32 @@ v_cmpx_nge_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x99,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: 
v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -851,23 +914,32 @@ v_cmpx_ngt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nle_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nle_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nle_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nle_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nle_f16_e64_dpp -v1, |2.0| 
dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -887,23 +959,32 @@ v_cmpx_nle_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nlg_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 
; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -923,23 +1004,32 @@ v_cmpx_nlg_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x7e,0x02,0x8e,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -959,23 +1049,32 @@ v_cmpx_nlt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_o_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_o_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x7e,0x02,0x87,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_o_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_o_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_o_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_o_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -995,23 +1094,32 @@ v_cmpx_o_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x97,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_u_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_u_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_u_f16_e64_dpp -v1, 
|v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_u_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_u_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_u_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_u_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s index ab01d37c39d3e..59634ba3cd64a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s @@ -554,50 +554,62 @@ v_cmpx_eq_u64 src_scc, v[2:3] v_cmpx_eq_u64 0xaf123456, v[254:255] // GFX12: v_cmpx_eq_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ge_f16 v1, v2 -// GFX12: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] +v_cmpx_ge_f16 v1.l, v2.l +// GFX12: v_cmpx_ge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0c,0x7d] -v_cmpx_ge_f16 v127, v2 -// GFX12: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] +v_cmpx_ge_f16 v127.l, v2.l +// GFX12: v_cmpx_ge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0c,0x7d] -v_cmpx_ge_f16 s1, v2 -// GFX12: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] +v_cmpx_ge_f16 s1, v2.l +// GFX12: v_cmpx_ge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0c,0x7d] -v_cmpx_ge_f16 s105, v2 -// GFX12: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] +v_cmpx_ge_f16 s105, v2.l +// GFX12: v_cmpx_ge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0c,0x7d] -v_cmpx_ge_f16 vcc_lo, v2 -// GFX12: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] +v_cmpx_ge_f16 vcc_lo, v2.l +// GFX12: v_cmpx_ge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0c,0x7d] -v_cmpx_ge_f16 vcc_hi, v2 -// GFX12: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] +v_cmpx_ge_f16 vcc_hi, v2.l +// GFX12: v_cmpx_ge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0c,0x7d] -v_cmpx_ge_f16 ttmp15, v2 -// GFX12: 
v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] +v_cmpx_ge_f16 ttmp15, v2.l +// GFX12: v_cmpx_ge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0c,0x7d] -v_cmpx_ge_f16 m0, v2 -// GFX12: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] +v_cmpx_ge_f16 m0, v2.l +// GFX12: v_cmpx_ge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0c,0x7d] -v_cmpx_ge_f16 exec_lo, v2 -// GFX12: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] +v_cmpx_ge_f16 exec_lo, v2.l +// GFX12: v_cmpx_ge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0c,0x7d] -v_cmpx_ge_f16 exec_hi, v2 -// GFX12: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] +v_cmpx_ge_f16 exec_hi, v2.l +// GFX12: v_cmpx_ge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0c,0x7d] -v_cmpx_ge_f16 null, v2 -// GFX12: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] +v_cmpx_ge_f16 null, v2.l +// GFX12: v_cmpx_ge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0c,0x7d] -v_cmpx_ge_f16 -1, v2 -// GFX12: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] +v_cmpx_ge_f16 -1, v2.l +// GFX12: v_cmpx_ge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0c,0x7d] -v_cmpx_ge_f16 0.5, v2 -// GFX12: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] +v_cmpx_ge_f16 0.5, v2.l +// GFX12: v_cmpx_ge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0c,0x7d] -v_cmpx_ge_f16 src_scc, v2 -// GFX12: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] +v_cmpx_ge_f16 src_scc, v2.l +// GFX12: v_cmpx_ge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0c,0x7d] -v_cmpx_ge_f16 0xfe0b, v127 -// GFX12: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_f16 0xfe0b, v127.l +// GFX12: v_cmpx_ge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_ge_f16 v1.h, v2.l +// GFX12: v_cmpx_ge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0c,0x7d] + +v_cmpx_ge_f16 v127.h, v2.l +// GFX12: v_cmpx_ge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0c,0x7d] + +v_cmpx_ge_f16 src_scc, v2.h +// GFX12: v_cmpx_ge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0d,0x7d] + +v_cmpx_ge_f16 0xfe0b, v127.h +// GFX12: v_cmpx_ge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ge_f32 v1, v2 // GFX12: v_cmpx_ge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2c,0x7d] @@ -956,50 +968,62 @@ v_cmpx_ge_u64 src_scc, v[2:3] v_cmpx_ge_u64 0xaf123456, v[254:255] // GFX12: v_cmpx_ge_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_gt_f16 v1, v2 -// GFX12: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] +v_cmpx_gt_f16 v1.l, v2.l +// GFX12: v_cmpx_gt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x08,0x7d] + +v_cmpx_gt_f16 v127.l, v2.l +// GFX12: v_cmpx_gt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x08,0x7d] + +v_cmpx_gt_f16 s1, v2.l +// GFX12: v_cmpx_gt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x08,0x7d] -v_cmpx_gt_f16 v127, v2 -// GFX12: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] +v_cmpx_gt_f16 s105, v2.l +// GFX12: v_cmpx_gt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x08,0x7d] -v_cmpx_gt_f16 s1, v2 -// GFX12: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] +v_cmpx_gt_f16 vcc_lo, v2.l +// GFX12: v_cmpx_gt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x08,0x7d] -v_cmpx_gt_f16 s105, v2 -// GFX12: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] +v_cmpx_gt_f16 vcc_hi, v2.l +// GFX12: v_cmpx_gt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x08,0x7d] -v_cmpx_gt_f16 vcc_lo, v2 -// 
GFX12: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] +v_cmpx_gt_f16 ttmp15, v2.l +// GFX12: v_cmpx_gt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x08,0x7d] -v_cmpx_gt_f16 vcc_hi, v2 -// GFX12: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] +v_cmpx_gt_f16 m0, v2.l +// GFX12: v_cmpx_gt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x08,0x7d] -v_cmpx_gt_f16 ttmp15, v2 -// GFX12: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] +v_cmpx_gt_f16 exec_lo, v2.l +// GFX12: v_cmpx_gt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x08,0x7d] -v_cmpx_gt_f16 m0, v2 -// GFX12: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] +v_cmpx_gt_f16 exec_hi, v2.l +// GFX12: v_cmpx_gt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x08,0x7d] -v_cmpx_gt_f16 exec_lo, v2 -// GFX12: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] +v_cmpx_gt_f16 null, v2.l +// GFX12: v_cmpx_gt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x08,0x7d] -v_cmpx_gt_f16 exec_hi, v2 -// GFX12: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] +v_cmpx_gt_f16 -1, v2.l +// GFX12: v_cmpx_gt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x08,0x7d] -v_cmpx_gt_f16 null, v2 -// GFX12: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] +v_cmpx_gt_f16 0.5, v2.l +// GFX12: v_cmpx_gt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x08,0x7d] -v_cmpx_gt_f16 -1, v2 -// GFX12: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] +v_cmpx_gt_f16 src_scc, v2.l +// GFX12: v_cmpx_gt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x08,0x7d] -v_cmpx_gt_f16 0.5, v2 -// GFX12: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] +v_cmpx_gt_f16 0xfe0b, v127.l +// GFX12: v_cmpx_gt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_gt_f16 src_scc, v2 -// GFX12: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] +v_cmpx_gt_f16 v1.h, v2.l +// GFX12: v_cmpx_gt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x08,0x7d] -v_cmpx_gt_f16 0xfe0b, v127 -// GFX12: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_f16 v127.h, v2.l +// GFX12: v_cmpx_gt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x08,0x7d] + +v_cmpx_gt_f16 src_scc, v2.h +// GFX12: v_cmpx_gt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x09,0x7d] + +v_cmpx_gt_f16 0xfe0b, v127.h +// GFX12: v_cmpx_gt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_gt_f32 v1, v2 // GFX12: v_cmpx_gt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x28,0x7d] @@ -1358,50 +1382,62 @@ v_cmpx_gt_u64 src_scc, v[2:3] v_cmpx_gt_u64 0xaf123456, v[254:255] // GFX12: v_cmpx_gt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_le_f16 v1, v2 -// GFX12: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] +v_cmpx_le_f16 v1.l, v2.l +// GFX12: v_cmpx_le_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x06,0x7d] + +v_cmpx_le_f16 v127.l, v2.l +// GFX12: v_cmpx_le_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x06,0x7d] + +v_cmpx_le_f16 s1, v2.l +// GFX12: v_cmpx_le_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x06,0x7d] + +v_cmpx_le_f16 s105, v2.l +// GFX12: v_cmpx_le_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x06,0x7d] + +v_cmpx_le_f16 vcc_lo, v2.l +// GFX12: v_cmpx_le_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x06,0x7d] -v_cmpx_le_f16 v127, v2 -// GFX12: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] +v_cmpx_le_f16 vcc_hi, v2.l +// GFX12: v_cmpx_le_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x06,0x7d] 
-v_cmpx_le_f16 s1, v2 -// GFX12: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] +v_cmpx_le_f16 ttmp15, v2.l +// GFX12: v_cmpx_le_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x06,0x7d] -v_cmpx_le_f16 s105, v2 -// GFX12: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] +v_cmpx_le_f16 m0, v2.l +// GFX12: v_cmpx_le_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x06,0x7d] -v_cmpx_le_f16 vcc_lo, v2 -// GFX12: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] +v_cmpx_le_f16 exec_lo, v2.l +// GFX12: v_cmpx_le_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x06,0x7d] -v_cmpx_le_f16 vcc_hi, v2 -// GFX12: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] +v_cmpx_le_f16 exec_hi, v2.l +// GFX12: v_cmpx_le_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x06,0x7d] -v_cmpx_le_f16 ttmp15, v2 -// GFX12: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] +v_cmpx_le_f16 null, v2.l +// GFX12: v_cmpx_le_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x06,0x7d] -v_cmpx_le_f16 m0, v2 -// GFX12: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] +v_cmpx_le_f16 -1, v2.l +// GFX12: v_cmpx_le_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x06,0x7d] -v_cmpx_le_f16 exec_lo, v2 -// GFX12: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] +v_cmpx_le_f16 0.5, v2.l +// GFX12: v_cmpx_le_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x06,0x7d] -v_cmpx_le_f16 exec_hi, v2 -// GFX12: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] +v_cmpx_le_f16 src_scc, v2.l +// GFX12: v_cmpx_le_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x06,0x7d] -v_cmpx_le_f16 null, v2 -// GFX12: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] +v_cmpx_le_f16 0xfe0b, v127.l +// GFX12: v_cmpx_le_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_le_f16 -1, v2 -// GFX12: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] +v_cmpx_le_f16 v1.h, v2.l +// GFX12: v_cmpx_le_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x06,0x7d] -v_cmpx_le_f16 0.5, v2 -// GFX12: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] +v_cmpx_le_f16 v127.h, v2.l +// GFX12: v_cmpx_le_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x06,0x7d] -v_cmpx_le_f16 src_scc, v2 -// GFX12: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] +v_cmpx_le_f16 src_scc, v2.h +// GFX12: v_cmpx_le_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x07,0x7d] -v_cmpx_le_f16 0xfe0b, v127 -// GFX12: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_le_f16 0xfe0b, v127.h +// GFX12: v_cmpx_le_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_le_f32 v1, v2 // GFX12: v_cmpx_le_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x26,0x7d] @@ -1760,50 +1796,62 @@ v_cmpx_le_u64 src_scc, v[2:3] v_cmpx_le_u64 0xaf123456, v[254:255] // GFX12: v_cmpx_le_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_lg_f16 v1, v2 -// GFX12: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] +v_cmpx_lg_f16 v1.l, v2.l +// GFX12: v_cmpx_lg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x7d] -v_cmpx_lg_f16 v127, v2 -// GFX12: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] +v_cmpx_lg_f16 v127.l, v2.l +// GFX12: v_cmpx_lg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x7d] -v_cmpx_lg_f16 s1, v2 -// GFX12: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] +v_cmpx_lg_f16 s1, v2.l +// GFX12: v_cmpx_lg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0a,0x7d] -v_cmpx_lg_f16 
s105, v2 -// GFX12: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] +v_cmpx_lg_f16 s105, v2.l +// GFX12: v_cmpx_lg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0a,0x7d] -v_cmpx_lg_f16 vcc_lo, v2 -// GFX12: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] +v_cmpx_lg_f16 vcc_lo, v2.l +// GFX12: v_cmpx_lg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x7d] -v_cmpx_lg_f16 vcc_hi, v2 -// GFX12: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] +v_cmpx_lg_f16 vcc_hi, v2.l +// GFX12: v_cmpx_lg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x7d] -v_cmpx_lg_f16 ttmp15, v2 -// GFX12: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] +v_cmpx_lg_f16 ttmp15, v2.l +// GFX12: v_cmpx_lg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x7d] -v_cmpx_lg_f16 m0, v2 -// GFX12: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] +v_cmpx_lg_f16 m0, v2.l +// GFX12: v_cmpx_lg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x7d] -v_cmpx_lg_f16 exec_lo, v2 -// GFX12: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] +v_cmpx_lg_f16 exec_lo, v2.l +// GFX12: v_cmpx_lg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x7d] -v_cmpx_lg_f16 exec_hi, v2 -// GFX12: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] +v_cmpx_lg_f16 exec_hi, v2.l +// GFX12: v_cmpx_lg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x7d] -v_cmpx_lg_f16 null, v2 -// GFX12: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] +v_cmpx_lg_f16 null, v2.l +// GFX12: v_cmpx_lg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0a,0x7d] -v_cmpx_lg_f16 -1, v2 -// GFX12: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] +v_cmpx_lg_f16 -1, v2.l +// GFX12: v_cmpx_lg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x7d] -v_cmpx_lg_f16 0.5, v2 -// GFX12: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] +v_cmpx_lg_f16 0.5, v2.l +// GFX12: v_cmpx_lg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x7d] -v_cmpx_lg_f16 src_scc, v2 -// GFX12: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] +v_cmpx_lg_f16 src_scc, v2.l +// GFX12: v_cmpx_lg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x7d] -v_cmpx_lg_f16 0xfe0b, v127 -// GFX12: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_lg_f16 0xfe0b, v127.l +// GFX12: v_cmpx_lg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_lg_f16 v1.h, v2.l +// GFX12: v_cmpx_lg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x7d] + +v_cmpx_lg_f16 v127.h, v2.l +// GFX12: v_cmpx_lg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x7d] + +v_cmpx_lg_f16 src_scc, v2.h +// GFX12: v_cmpx_lg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x7d] + +v_cmpx_lg_f16 0xfe0b, v127.h +// GFX12: v_cmpx_lg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_lg_f32 v1, v2 // GFX12: v_cmpx_lg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2a,0x7d] @@ -2576,50 +2624,62 @@ v_cmpx_ne_u64 src_scc, v[2:3] v_cmpx_ne_u64 0xaf123456, v[254:255] // GFX12: v_cmpx_ne_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_neq_f16 v1, v2 -// GFX12: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d] +v_cmpx_neq_f16 v1.l, v2.l +// GFX12: v_cmpx_neq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1a,0x7d] + +v_cmpx_neq_f16 v127.l, v2.l +// GFX12: v_cmpx_neq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1a,0x7d] + +v_cmpx_neq_f16 s1, v2.l +// GFX12: v_cmpx_neq_f16_e32 s1, v2.l ; encoding: 
[0x01,0x04,0x1a,0x7d] -v_cmpx_neq_f16 v127, v2 -// GFX12: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d] +v_cmpx_neq_f16 s105, v2.l +// GFX12: v_cmpx_neq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1a,0x7d] -v_cmpx_neq_f16 s1, v2 -// GFX12: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d] +v_cmpx_neq_f16 vcc_lo, v2.l +// GFX12: v_cmpx_neq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1a,0x7d] -v_cmpx_neq_f16 s105, v2 -// GFX12: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d] +v_cmpx_neq_f16 vcc_hi, v2.l +// GFX12: v_cmpx_neq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1a,0x7d] -v_cmpx_neq_f16 vcc_lo, v2 -// GFX12: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d] +v_cmpx_neq_f16 ttmp15, v2.l +// GFX12: v_cmpx_neq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1a,0x7d] -v_cmpx_neq_f16 vcc_hi, v2 -// GFX12: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d] +v_cmpx_neq_f16 m0, v2.l +// GFX12: v_cmpx_neq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1a,0x7d] -v_cmpx_neq_f16 ttmp15, v2 -// GFX12: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d] +v_cmpx_neq_f16 exec_lo, v2.l +// GFX12: v_cmpx_neq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1a,0x7d] -v_cmpx_neq_f16 m0, v2 -// GFX12: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d] +v_cmpx_neq_f16 exec_hi, v2.l +// GFX12: v_cmpx_neq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1a,0x7d] -v_cmpx_neq_f16 exec_lo, v2 -// GFX12: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d] +v_cmpx_neq_f16 null, v2.l +// GFX12: v_cmpx_neq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1a,0x7d] -v_cmpx_neq_f16 exec_hi, v2 -// GFX12: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d] +v_cmpx_neq_f16 -1, v2.l +// GFX12: v_cmpx_neq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1a,0x7d] -v_cmpx_neq_f16 null, v2 -// GFX12: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d] +v_cmpx_neq_f16 0.5, v2.l +// GFX12: v_cmpx_neq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1a,0x7d] -v_cmpx_neq_f16 -1, v2 -// GFX12: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d] +v_cmpx_neq_f16 src_scc, v2.l +// GFX12: v_cmpx_neq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1a,0x7d] -v_cmpx_neq_f16 0.5, v2 -// GFX12: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d] +v_cmpx_neq_f16 0xfe0b, v127.l +// GFX12: v_cmpx_neq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_neq_f16 src_scc, v2 -// GFX12: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d] +v_cmpx_neq_f16 v1.h, v2.l +// GFX12: v_cmpx_neq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1a,0x7d] -v_cmpx_neq_f16 0xfe0b, v127 -// GFX12: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_neq_f16 v127.h, v2.l +// GFX12: v_cmpx_neq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1a,0x7d] + +v_cmpx_neq_f16 src_scc, v2.h +// GFX12: v_cmpx_neq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1b,0x7d] + +v_cmpx_neq_f16 0xfe0b, v127.h +// GFX12: v_cmpx_neq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_neq_f32 v1, v2 // GFX12: v_cmpx_neq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3a,0x7d] @@ -2702,50 +2762,62 @@ v_cmpx_neq_f64 src_scc, v[2:3] v_cmpx_neq_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_neq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nge_f16 v1, v2 -// GFX12: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d] 
+v_cmpx_nge_f16 v1.l, v2.l +// GFX12: v_cmpx_nge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x12,0x7d] + +v_cmpx_nge_f16 v127.l, v2.l +// GFX12: v_cmpx_nge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x12,0x7d] + +v_cmpx_nge_f16 s1, v2.l +// GFX12: v_cmpx_nge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x12,0x7d] + +v_cmpx_nge_f16 s105, v2.l +// GFX12: v_cmpx_nge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x12,0x7d] + +v_cmpx_nge_f16 vcc_lo, v2.l +// GFX12: v_cmpx_nge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x12,0x7d] -v_cmpx_nge_f16 v127, v2 -// GFX12: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d] +v_cmpx_nge_f16 vcc_hi, v2.l +// GFX12: v_cmpx_nge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x12,0x7d] -v_cmpx_nge_f16 s1, v2 -// GFX12: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d] +v_cmpx_nge_f16 ttmp15, v2.l +// GFX12: v_cmpx_nge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x12,0x7d] -v_cmpx_nge_f16 s105, v2 -// GFX12: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d] +v_cmpx_nge_f16 m0, v2.l +// GFX12: v_cmpx_nge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x12,0x7d] -v_cmpx_nge_f16 vcc_lo, v2 -// GFX12: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d] +v_cmpx_nge_f16 exec_lo, v2.l +// GFX12: v_cmpx_nge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x12,0x7d] -v_cmpx_nge_f16 vcc_hi, v2 -// GFX12: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d] +v_cmpx_nge_f16 exec_hi, v2.l +// GFX12: v_cmpx_nge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x12,0x7d] -v_cmpx_nge_f16 ttmp15, v2 -// GFX12: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d] +v_cmpx_nge_f16 null, v2.l +// GFX12: v_cmpx_nge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x12,0x7d] -v_cmpx_nge_f16 m0, v2 -// GFX12: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d] +v_cmpx_nge_f16 -1, v2.l +// GFX12: v_cmpx_nge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x12,0x7d] -v_cmpx_nge_f16 exec_lo, v2 -// GFX12: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d] +v_cmpx_nge_f16 0.5, v2.l +// GFX12: v_cmpx_nge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x12,0x7d] -v_cmpx_nge_f16 exec_hi, v2 -// GFX12: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d] +v_cmpx_nge_f16 src_scc, v2.l +// GFX12: v_cmpx_nge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x12,0x7d] -v_cmpx_nge_f16 null, v2 -// GFX12: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d] +v_cmpx_nge_f16 0xfe0b, v127.l +// GFX12: v_cmpx_nge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_nge_f16 -1, v2 -// GFX12: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d] +v_cmpx_nge_f16 v1.h, v2.l +// GFX12: v_cmpx_nge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x12,0x7d] -v_cmpx_nge_f16 0.5, v2 -// GFX12: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d] +v_cmpx_nge_f16 v127.h, v2.l +// GFX12: v_cmpx_nge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x12,0x7d] -v_cmpx_nge_f16 src_scc, v2 -// GFX12: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d] +v_cmpx_nge_f16 src_scc, v2.h +// GFX12: v_cmpx_nge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x13,0x7d] -v_cmpx_nge_f16 0xfe0b, v127 -// GFX12: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nge_f16 0xfe0b, v127.h +// GFX12: v_cmpx_nge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nge_f32 v1, v2 // GFX12: v_cmpx_nge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x32,0x7d] @@ -2828,50 
+2900,62 @@ v_cmpx_nge_f64 src_scc, v[2:3] v_cmpx_nge_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_nge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ngt_f16 v1, v2 -// GFX12: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d] +v_cmpx_ngt_f16 v1.l, v2.l +// GFX12: v_cmpx_ngt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x16,0x7d] -v_cmpx_ngt_f16 v127, v2 -// GFX12: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d] +v_cmpx_ngt_f16 v127.l, v2.l +// GFX12: v_cmpx_ngt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x16,0x7d] -v_cmpx_ngt_f16 s1, v2 -// GFX12: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d] +v_cmpx_ngt_f16 s1, v2.l +// GFX12: v_cmpx_ngt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x16,0x7d] -v_cmpx_ngt_f16 s105, v2 -// GFX12: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d] +v_cmpx_ngt_f16 s105, v2.l +// GFX12: v_cmpx_ngt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x16,0x7d] -v_cmpx_ngt_f16 vcc_lo, v2 -// GFX12: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d] +v_cmpx_ngt_f16 vcc_lo, v2.l +// GFX12: v_cmpx_ngt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x16,0x7d] -v_cmpx_ngt_f16 vcc_hi, v2 -// GFX12: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d] +v_cmpx_ngt_f16 vcc_hi, v2.l +// GFX12: v_cmpx_ngt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x16,0x7d] -v_cmpx_ngt_f16 ttmp15, v2 -// GFX12: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d] +v_cmpx_ngt_f16 ttmp15, v2.l +// GFX12: v_cmpx_ngt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x16,0x7d] -v_cmpx_ngt_f16 m0, v2 -// GFX12: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d] +v_cmpx_ngt_f16 m0, v2.l +// GFX12: v_cmpx_ngt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x16,0x7d] -v_cmpx_ngt_f16 exec_lo, v2 -// GFX12: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d] +v_cmpx_ngt_f16 exec_lo, v2.l +// GFX12: v_cmpx_ngt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x16,0x7d] -v_cmpx_ngt_f16 exec_hi, v2 -// GFX12: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d] +v_cmpx_ngt_f16 exec_hi, v2.l +// GFX12: v_cmpx_ngt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x16,0x7d] -v_cmpx_ngt_f16 null, v2 -// GFX12: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d] +v_cmpx_ngt_f16 null, v2.l +// GFX12: v_cmpx_ngt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x16,0x7d] -v_cmpx_ngt_f16 -1, v2 -// GFX12: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d] +v_cmpx_ngt_f16 -1, v2.l +// GFX12: v_cmpx_ngt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x16,0x7d] -v_cmpx_ngt_f16 0.5, v2 -// GFX12: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d] +v_cmpx_ngt_f16 0.5, v2.l +// GFX12: v_cmpx_ngt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x16,0x7d] -v_cmpx_ngt_f16 src_scc, v2 -// GFX12: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d] +v_cmpx_ngt_f16 src_scc, v2.l +// GFX12: v_cmpx_ngt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x16,0x7d] -v_cmpx_ngt_f16 0xfe0b, v127 -// GFX12: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ngt_f16 0xfe0b, v127.l +// GFX12: v_cmpx_ngt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_ngt_f16 v1.h, v2.l +// GFX12: v_cmpx_ngt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x16,0x7d] + +v_cmpx_ngt_f16 v127.h, v2.l +// GFX12: v_cmpx_ngt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x16,0x7d] + +v_cmpx_ngt_f16 src_scc, v2.h +// GFX12: v_cmpx_ngt_f16_e32 
src_scc, v2.h ; encoding: [0xfd,0x04,0x17,0x7d] + +v_cmpx_ngt_f16 0xfe0b, v127.h +// GFX12: v_cmpx_ngt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ngt_f32 v1, v2 // GFX12: v_cmpx_ngt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x36,0x7d] @@ -2954,50 +3038,62 @@ v_cmpx_ngt_f64 src_scc, v[2:3] v_cmpx_ngt_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_ngt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nle_f16 v1, v2 -// GFX12: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d] +v_cmpx_nle_f16 v1.l, v2.l +// GFX12: v_cmpx_nle_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x18,0x7d] + +v_cmpx_nle_f16 v127.l, v2.l +// GFX12: v_cmpx_nle_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x18,0x7d] + +v_cmpx_nle_f16 s1, v2.l +// GFX12: v_cmpx_nle_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x18,0x7d] -v_cmpx_nle_f16 v127, v2 -// GFX12: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d] +v_cmpx_nle_f16 s105, v2.l +// GFX12: v_cmpx_nle_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x18,0x7d] -v_cmpx_nle_f16 s1, v2 -// GFX12: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d] +v_cmpx_nle_f16 vcc_lo, v2.l +// GFX12: v_cmpx_nle_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x18,0x7d] -v_cmpx_nle_f16 s105, v2 -// GFX12: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d] +v_cmpx_nle_f16 vcc_hi, v2.l +// GFX12: v_cmpx_nle_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x18,0x7d] -v_cmpx_nle_f16 vcc_lo, v2 -// GFX12: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d] +v_cmpx_nle_f16 ttmp15, v2.l +// GFX12: v_cmpx_nle_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x18,0x7d] -v_cmpx_nle_f16 vcc_hi, v2 -// GFX12: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d] +v_cmpx_nle_f16 m0, v2.l +// GFX12: v_cmpx_nle_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x18,0x7d] -v_cmpx_nle_f16 ttmp15, v2 -// GFX12: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d] +v_cmpx_nle_f16 exec_lo, v2.l +// GFX12: v_cmpx_nle_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x18,0x7d] -v_cmpx_nle_f16 m0, v2 -// GFX12: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d] +v_cmpx_nle_f16 exec_hi, v2.l +// GFX12: v_cmpx_nle_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x18,0x7d] -v_cmpx_nle_f16 exec_lo, v2 -// GFX12: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d] +v_cmpx_nle_f16 null, v2.l +// GFX12: v_cmpx_nle_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x18,0x7d] -v_cmpx_nle_f16 exec_hi, v2 -// GFX12: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d] +v_cmpx_nle_f16 -1, v2.l +// GFX12: v_cmpx_nle_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x18,0x7d] -v_cmpx_nle_f16 null, v2 -// GFX12: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d] +v_cmpx_nle_f16 0.5, v2.l +// GFX12: v_cmpx_nle_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x18,0x7d] -v_cmpx_nle_f16 -1, v2 -// GFX12: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d] +v_cmpx_nle_f16 src_scc, v2.l +// GFX12: v_cmpx_nle_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x18,0x7d] -v_cmpx_nle_f16 0.5, v2 -// GFX12: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d] +v_cmpx_nle_f16 0xfe0b, v127.l +// GFX12: v_cmpx_nle_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_nle_f16 src_scc, v2 -// GFX12: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d] +v_cmpx_nle_f16 v1.h, v2.l +// GFX12: v_cmpx_nle_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x18,0x7d] -v_cmpx_nle_f16 
0xfe0b, v127 -// GFX12: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nle_f16 v127.h, v2.l +// GFX12: v_cmpx_nle_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x18,0x7d] + +v_cmpx_nle_f16 src_scc, v2.h +// GFX12: v_cmpx_nle_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x19,0x7d] + +v_cmpx_nle_f16 0xfe0b, v127.h +// GFX12: v_cmpx_nle_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nle_f32 v1, v2 // GFX12: v_cmpx_nle_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x38,0x7d] @@ -3080,50 +3176,62 @@ v_cmpx_nle_f64 src_scc, v[2:3] v_cmpx_nle_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_nle_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nlg_f16 v1, v2 -// GFX12: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d] +v_cmpx_nlg_f16 v1.l, v2.l +// GFX12: v_cmpx_nlg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x14,0x7d] + +v_cmpx_nlg_f16 v127.l, v2.l +// GFX12: v_cmpx_nlg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x14,0x7d] + +v_cmpx_nlg_f16 s1, v2.l +// GFX12: v_cmpx_nlg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x14,0x7d] + +v_cmpx_nlg_f16 s105, v2.l +// GFX12: v_cmpx_nlg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x14,0x7d] + +v_cmpx_nlg_f16 vcc_lo, v2.l +// GFX12: v_cmpx_nlg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x14,0x7d] -v_cmpx_nlg_f16 v127, v2 -// GFX12: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d] +v_cmpx_nlg_f16 vcc_hi, v2.l +// GFX12: v_cmpx_nlg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x14,0x7d] -v_cmpx_nlg_f16 s1, v2 -// GFX12: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d] +v_cmpx_nlg_f16 ttmp15, v2.l +// GFX12: v_cmpx_nlg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x14,0x7d] -v_cmpx_nlg_f16 s105, v2 -// GFX12: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d] +v_cmpx_nlg_f16 m0, v2.l +// GFX12: v_cmpx_nlg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x14,0x7d] -v_cmpx_nlg_f16 vcc_lo, v2 -// GFX12: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d] +v_cmpx_nlg_f16 exec_lo, v2.l +// GFX12: v_cmpx_nlg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x14,0x7d] -v_cmpx_nlg_f16 vcc_hi, v2 -// GFX12: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d] +v_cmpx_nlg_f16 exec_hi, v2.l +// GFX12: v_cmpx_nlg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x14,0x7d] -v_cmpx_nlg_f16 ttmp15, v2 -// GFX12: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d] +v_cmpx_nlg_f16 null, v2.l +// GFX12: v_cmpx_nlg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x14,0x7d] -v_cmpx_nlg_f16 m0, v2 -// GFX12: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d] +v_cmpx_nlg_f16 -1, v2.l +// GFX12: v_cmpx_nlg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x14,0x7d] -v_cmpx_nlg_f16 exec_lo, v2 -// GFX12: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d] +v_cmpx_nlg_f16 0.5, v2.l +// GFX12: v_cmpx_nlg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x14,0x7d] -v_cmpx_nlg_f16 exec_hi, v2 -// GFX12: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d] +v_cmpx_nlg_f16 src_scc, v2.l +// GFX12: v_cmpx_nlg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x14,0x7d] -v_cmpx_nlg_f16 null, v2 -// GFX12: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d] +v_cmpx_nlg_f16 0xfe0b, v127.l +// GFX12: v_cmpx_nlg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_nlg_f16 -1, v2 -// GFX12: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d] +v_cmpx_nlg_f16 v1.h, v2.l +// 
GFX12: v_cmpx_nlg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x14,0x7d] -v_cmpx_nlg_f16 0.5, v2 -// GFX12: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d] +v_cmpx_nlg_f16 v127.h, v2.l +// GFX12: v_cmpx_nlg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x14,0x7d] -v_cmpx_nlg_f16 src_scc, v2 -// GFX12: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d] +v_cmpx_nlg_f16 src_scc, v2.h +// GFX12: v_cmpx_nlg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x15,0x7d] -v_cmpx_nlg_f16 0xfe0b, v127 -// GFX12: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nlg_f16 0xfe0b, v127.h +// GFX12: v_cmpx_nlg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nlg_f32 v1, v2 // GFX12: v_cmpx_nlg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x34,0x7d] @@ -3206,50 +3314,62 @@ v_cmpx_nlg_f64 src_scc, v[2:3] v_cmpx_nlg_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_nlg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nlt_f16 v1, v2 -// GFX12: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d] +v_cmpx_nlt_f16 v1.l, v2.l +// GFX12: v_cmpx_nlt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1c,0x7d] -v_cmpx_nlt_f16 v127, v2 -// GFX12: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d] +v_cmpx_nlt_f16 v127.l, v2.l +// GFX12: v_cmpx_nlt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1c,0x7d] -v_cmpx_nlt_f16 s1, v2 -// GFX12: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 s1, v2.l +// GFX12: v_cmpx_nlt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 s105, v2 -// GFX12: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 s105, v2.l +// GFX12: v_cmpx_nlt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 vcc_lo, v2 -// GFX12: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 vcc_lo, v2.l +// GFX12: v_cmpx_nlt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 vcc_hi, v2 -// GFX12: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 vcc_hi, v2.l +// GFX12: v_cmpx_nlt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 ttmp15, v2 -// GFX12: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 ttmp15, v2.l +// GFX12: v_cmpx_nlt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 m0, v2 -// GFX12: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 m0, v2.l +// GFX12: v_cmpx_nlt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 exec_lo, v2 -// GFX12: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 exec_lo, v2.l +// GFX12: v_cmpx_nlt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 exec_hi, v2 -// GFX12: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 exec_hi, v2.l +// GFX12: v_cmpx_nlt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 null, v2 -// GFX12: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 null, v2.l +// GFX12: v_cmpx_nlt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 -1, v2 -// GFX12: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 -1, v2.l +// GFX12: v_cmpx_nlt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 0.5, v2 -// GFX12: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d] 
+v_cmpx_nlt_f16 0.5, v2.l +// GFX12: v_cmpx_nlt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 src_scc, v2 -// GFX12: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 src_scc, v2.l +// GFX12: v_cmpx_nlt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 0xfe0b, v127 -// GFX12: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nlt_f16 0xfe0b, v127.l +// GFX12: v_cmpx_nlt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_nlt_f16 v1.h, v2.l +// GFX12: v_cmpx_nlt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1c,0x7d] + +v_cmpx_nlt_f16 v127.h, v2.l +// GFX12: v_cmpx_nlt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1c,0x7d] + +v_cmpx_nlt_f16 src_scc, v2.h +// GFX12: v_cmpx_nlt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1d,0x7d] + +v_cmpx_nlt_f16 0xfe0b, v127.h +// GFX12: v_cmpx_nlt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nlt_f32 v1, v2 // GFX12: v_cmpx_nlt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3c,0x7d] @@ -3332,50 +3452,62 @@ v_cmpx_nlt_f64 src_scc, v[2:3] v_cmpx_nlt_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_nlt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_o_f16 v1, v2 -// GFX12: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d] +v_cmpx_o_f16 v1.l, v2.l +// GFX12: v_cmpx_o_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0e,0x7d] + +v_cmpx_o_f16 v127.l, v2.l +// GFX12: v_cmpx_o_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0e,0x7d] + +v_cmpx_o_f16 s1, v2.l +// GFX12: v_cmpx_o_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0e,0x7d] -v_cmpx_o_f16 v127, v2 -// GFX12: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d] +v_cmpx_o_f16 s105, v2.l +// GFX12: v_cmpx_o_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0e,0x7d] -v_cmpx_o_f16 s1, v2 -// GFX12: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d] +v_cmpx_o_f16 vcc_lo, v2.l +// GFX12: v_cmpx_o_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0e,0x7d] -v_cmpx_o_f16 s105, v2 -// GFX12: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d] +v_cmpx_o_f16 vcc_hi, v2.l +// GFX12: v_cmpx_o_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0e,0x7d] -v_cmpx_o_f16 vcc_lo, v2 -// GFX12: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d] +v_cmpx_o_f16 ttmp15, v2.l +// GFX12: v_cmpx_o_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0e,0x7d] -v_cmpx_o_f16 vcc_hi, v2 -// GFX12: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d] +v_cmpx_o_f16 m0, v2.l +// GFX12: v_cmpx_o_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0e,0x7d] -v_cmpx_o_f16 ttmp15, v2 -// GFX12: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d] +v_cmpx_o_f16 exec_lo, v2.l +// GFX12: v_cmpx_o_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0e,0x7d] -v_cmpx_o_f16 m0, v2 -// GFX12: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d] +v_cmpx_o_f16 exec_hi, v2.l +// GFX12: v_cmpx_o_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0e,0x7d] -v_cmpx_o_f16 exec_lo, v2 -// GFX12: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d] +v_cmpx_o_f16 null, v2.l +// GFX12: v_cmpx_o_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0e,0x7d] -v_cmpx_o_f16 exec_hi, v2 -// GFX12: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d] +v_cmpx_o_f16 -1, v2.l +// GFX12: v_cmpx_o_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0e,0x7d] -v_cmpx_o_f16 null, v2 -// GFX12: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d] 
+v_cmpx_o_f16 0.5, v2.l +// GFX12: v_cmpx_o_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0e,0x7d] -v_cmpx_o_f16 -1, v2 -// GFX12: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d] +v_cmpx_o_f16 src_scc, v2.l +// GFX12: v_cmpx_o_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0e,0x7d] -v_cmpx_o_f16 0.5, v2 -// GFX12: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d] +v_cmpx_o_f16 0xfe0b, v127.l +// GFX12: v_cmpx_o_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_o_f16 src_scc, v2 -// GFX12: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d] +v_cmpx_o_f16 v1.h, v2.l +// GFX12: v_cmpx_o_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0e,0x7d] -v_cmpx_o_f16 0xfe0b, v127 -// GFX12: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_o_f16 v127.h, v2.l +// GFX12: v_cmpx_o_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0e,0x7d] + +v_cmpx_o_f16 src_scc, v2.h +// GFX12: v_cmpx_o_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0f,0x7d] + +v_cmpx_o_f16 0xfe0b, v127.h +// GFX12: v_cmpx_o_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_o_f32 v1, v2 // GFX12: v_cmpx_o_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2e,0x7d] @@ -3458,50 +3590,62 @@ v_cmpx_o_f64 src_scc, v[2:3] v_cmpx_o_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_o_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_u_f16 v1, v2 -// GFX12: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d] +v_cmpx_u_f16 v1.l, v2.l +// GFX12: v_cmpx_u_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x10,0x7d] + +v_cmpx_u_f16 v127.l, v2.l +// GFX12: v_cmpx_u_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x10,0x7d] + +v_cmpx_u_f16 s1, v2.l +// GFX12: v_cmpx_u_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x10,0x7d] + +v_cmpx_u_f16 s105, v2.l +// GFX12: v_cmpx_u_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x10,0x7d] + +v_cmpx_u_f16 vcc_lo, v2.l +// GFX12: v_cmpx_u_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x10,0x7d] -v_cmpx_u_f16 v127, v2 -// GFX12: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d] +v_cmpx_u_f16 vcc_hi, v2.l +// GFX12: v_cmpx_u_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x10,0x7d] -v_cmpx_u_f16 s1, v2 -// GFX12: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d] +v_cmpx_u_f16 ttmp15, v2.l +// GFX12: v_cmpx_u_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x10,0x7d] -v_cmpx_u_f16 s105, v2 -// GFX12: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d] +v_cmpx_u_f16 m0, v2.l +// GFX12: v_cmpx_u_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x10,0x7d] -v_cmpx_u_f16 vcc_lo, v2 -// GFX12: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d] +v_cmpx_u_f16 exec_lo, v2.l +// GFX12: v_cmpx_u_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x10,0x7d] -v_cmpx_u_f16 vcc_hi, v2 -// GFX12: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d] +v_cmpx_u_f16 exec_hi, v2.l +// GFX12: v_cmpx_u_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x10,0x7d] -v_cmpx_u_f16 ttmp15, v2 -// GFX12: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d] +v_cmpx_u_f16 null, v2.l +// GFX12: v_cmpx_u_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x10,0x7d] -v_cmpx_u_f16 m0, v2 -// GFX12: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] +v_cmpx_u_f16 -1, v2.l +// GFX12: v_cmpx_u_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x10,0x7d] -v_cmpx_u_f16 exec_lo, v2 -// GFX12: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d] +v_cmpx_u_f16 0.5, v2.l +// GFX12: v_cmpx_u_f16_e32 0.5, 
v2.l ; encoding: [0xf0,0x04,0x10,0x7d] -v_cmpx_u_f16 exec_hi, v2 -// GFX12: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] +v_cmpx_u_f16 src_scc, v2.l +// GFX12: v_cmpx_u_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x10,0x7d] -v_cmpx_u_f16 null, v2 -// GFX12: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] +v_cmpx_u_f16 0xfe0b, v127.l +// GFX12: v_cmpx_u_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_u_f16 -1, v2 -// GFX12: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] +v_cmpx_u_f16 v1.h, v2.l +// GFX12: v_cmpx_u_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x10,0x7d] -v_cmpx_u_f16 0.5, v2 -// GFX12: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d] +v_cmpx_u_f16 v127.h, v2.l +// GFX12: v_cmpx_u_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x10,0x7d] -v_cmpx_u_f16 src_scc, v2 -// GFX12: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d] +v_cmpx_u_f16 src_scc, v2.h +// GFX12: v_cmpx_u_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x11,0x7d] -v_cmpx_u_f16 0xfe0b, v127 -// GFX12: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_u_f16 0xfe0b, v127.h +// GFX12: v_cmpx_u_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_u_f32 v1, v2 // GFX12: v_cmpx_u_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x30,0x7d] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s index 2b919fa9d671e..3a3a89b8a9932 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s @@ -362,47 +362,53 @@ v_cmpx_eq_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_ge_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_ge_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_ge_f16 v1, v2 row_mirror -// GFX12: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l 
row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_ge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_ge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_ge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_ge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xf5,0x30] + +v_cmpx_ge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0xfa,0x04,0x0d,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_ge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff] @@ -626,47 +632,53 @@ v_cmpx_ge_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_gt_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_mirror -// GFX12: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_gt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_gt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_gt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_gt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_gt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x09,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_gt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_gt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff] @@ -890,47 +902,53 @@ v_cmpx_gt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_le_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// 
GFX12: v_cmpx_le_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_le_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_le_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_mirror -// GFX12: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// 
GFX12: v_cmpx_le_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_le_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_le_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_le_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_le_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x07,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_le_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_le_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff] @@ -1154,47 +1172,53 @@ v_cmpx_le_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_lg_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_lg_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_lg_f16 v1, v2 row_mirror -// GFX12: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_lg_f16 
v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_lg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_lg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_lg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_lg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xf5,0x30] + +v_cmpx_lg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0xfa,0x04,0x0b,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_lg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff] @@ -1688,47 +1712,53 @@ v_cmpx_ne_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_neq_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_neq_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_mirror -// GFX12: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_ror:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_neq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_neq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_neq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_neq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_neq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1b,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_neq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_neq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff] @@ -1772,47 +1802,53 @@ v_cmpx_neq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_nge_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_nge_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_mirror -// GFX12: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_nge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_nge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_nge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x13,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_nge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff] @@ -1856,47 +1892,53 @@ v_cmpx_nge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_ngt_f16 v1, v2 row_mirror -// GFX12: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_shl:1 -// 
GFX12: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_ngt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_ngt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_ngt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_ngt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xf5,0x30] + +v_cmpx_ngt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ngt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x17,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_ngt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ngt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff] @@ -1940,47 +1982,53 @@ v_cmpx_ngt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_nle_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_nle_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_mirror -// GFX12: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_nle_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_nle_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_nle_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_nle_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nle_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x19,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_nle_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nle_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff] @@ -2024,47 +2072,53 @@ v_cmpx_nle_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] -// 
GFX12: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_nlg_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_mirror -// GFX12: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_share:15 row_mask:0x0 
bank_mask:0x1 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_nlg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_nlg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_nlg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nlg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x15,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_nlg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nlg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff] @@ -2108,47 +2162,53 @@ v_cmpx_nlg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_nlt_f16 v1, v2 row_mirror -// GFX12: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_nlt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_nlt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 
row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_nlt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_nlt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xf5,0x30] + +v_cmpx_nlt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nlt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1d,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_nlt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nlt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff] @@ -2192,47 +2252,53 @@ v_cmpx_nlt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_o_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_o_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_o_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_o_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_mirror -// GFX12: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_o_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_o_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_o_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_o_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_o_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0f,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_o_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_o_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff] @@ -2276,47 +2342,53 @@ v_cmpx_o_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; 
encoding: [0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_u_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_u_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_u_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_u_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_mirror -// GFX12: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_share:15 
row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_u_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_u_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_u_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_u_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x11,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_u_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_u_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s index 11579786d78a8..0f30751003373 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s @@ -98,14 +98,20 @@ v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_ge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_ge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_ge_f16 v1.h, v2.h 
dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05] @@ -164,14 +170,20 @@ v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_gt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x09,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00] v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05] @@ -230,14 +242,20 @@ v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_le_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x07,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; 
encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00] v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05] @@ -296,14 +314,20 @@ v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_lg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_lg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00] v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05] @@ -434,14 +458,20 @@ v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_neq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_neq_f16 v127.h, v127.h 
dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_neq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00] v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05] @@ -452,14 +482,20 @@ v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_nge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x13,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00] v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05] @@ -470,14 +506,20 @@ v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_ngt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x17,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_ngt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ngt_f16 
v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00] v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05] @@ -488,14 +530,20 @@ v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_nle_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x19,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00] v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05] @@ -506,14 +554,20 @@ v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x15,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xe9,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00] v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05] @@ -524,14 +578,20 @@ v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_nlt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00] v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05] @@ -542,14 +602,20 @@ v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_o_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_o_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00] v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // 
GFX12: v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05] @@ -560,14 +626,20 @@ v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_u_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x11,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00] v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s index 265ab2c8ff66d..58c355ed56ab1 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s @@ -145,23 +145,41 @@ v_cmpx_eq_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_eq_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ge_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ge_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ge_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// 
GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_ge_i16_e32 v1.h, v255.h // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction @@ -235,23 +253,41 @@ v_cmpx_ge_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_ge_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_gt_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_gt_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_gt_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_gt_i16_e32 v1.h, v255.h // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction @@ -325,23 +361,41 @@ v_cmpx_gt_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_gt_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_le_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_le_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: 
error: invalid operand for instruction -v_cmpx_le_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_le_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_le_i16_e32 v1.h, v255.h // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction @@ -415,23 +469,41 @@ v_cmpx_le_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_le_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lg_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lg_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lg_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lg_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lg_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lg_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lg_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lg_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lg_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lg_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lg_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lg_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_lg_f16_e32 
v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lg_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lg_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lg_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lg_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lg_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_lt_f16_e32 v1.h, v255.h // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction @@ -613,146 +685,290 @@ v_cmpx_ne_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_ne_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_neq_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_neq_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v255.l, v2.l +// 
GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nlg_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_neq_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_neq_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for 
instruction +v_cmpx_nlg_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_neq_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlg_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_neq_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_neq_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlg_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlg_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlt_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlt_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for 
instruction -v_cmpx_nle_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlt_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlt_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nlg_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_o_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_nlg_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_nlg_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_nlg_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_o_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_nlg_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_nlg_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_nlt_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_o_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_nlt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_nlt_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_nlt_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_o_f16_e32 v255.l, 
v2.l +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_nlt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_nlt_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_o_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_u_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_o_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_o_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_o_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_u_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_o_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_o_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_u_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_u_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_u_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_u_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_u_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_u_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_u_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_u_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s index ed228c061d019..3d02c95c94ac0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s @@ -145,23 +145,41 @@ v_cmpx_eq_u16 v255.l, 
v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_eq_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ge_f16 v1, v255 -// GFX12: v_cmpx_ge_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_ge_f16 v1.h, v255.h +// GFX12: v_cmpx_ge_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x86,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ge_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_f16 v255, v2 -// GFX12: v_cmpx_ge_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ge_f16 v1.l, v255.l +// GFX12: v_cmpx_ge_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ge_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ge_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ge_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_f16 v255.h, v2.h +// GFX12: v_cmpx_ge_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x86,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_ge_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_ge_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_ge_f16 v255.l, v2.l +// GFX12: v_cmpx_ge_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_ge_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_ge_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ge_i16 v1.h, v255.h // GFX12: v_cmpx_ge_i16_e64 v1.h, v255.h ; encoding: 
[0x7e,0x18,0xb6,0xd4,0x01,0xff,0x03,0x00] @@ -235,23 +253,41 @@ v_cmpx_ge_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_ge_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_gt_f16 v1, v255 -// GFX12: v_cmpx_gt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_gt_f16 v1.h, v255.h +// GFX12: v_cmpx_gt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x84,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_gt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_f16 v1.l, v255.l +// GFX12: v_cmpx_gt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_gt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_gt_f16 v255, v2 -// GFX12: v_cmpx_gt_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_gt_f16 v255.h, v2.h +// GFX12: v_cmpx_gt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x84,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_gt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_gt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_gt_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_gt_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_gt_f16 v255.l, v2.l +// GFX12: v_cmpx_gt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_gt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_gt_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_gt_i16 
v1.h, v255.h // GFX12: v_cmpx_gt_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb4,0xd4,0x01,0xff,0x03,0x00] @@ -325,23 +361,41 @@ v_cmpx_gt_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_gt_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_le_f16 v1, v255 -// GFX12: v_cmpx_le_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_le_f16 v1.h, v255.h +// GFX12: v_cmpx_le_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x83,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_le_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16 v1.l, v255.l +// GFX12: v_cmpx_le_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_le_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16 v255.h, v2.h +// GFX12: v_cmpx_le_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x83,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_le_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_le_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_le_f16 v255, v2 -// GFX12: v_cmpx_le_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_le_f16 v255.l, v2.l +// GFX12: v_cmpx_le_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_le_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_le_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_le_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_le_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_le_i16 v1.h, v255.h // GFX12: v_cmpx_le_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb3,0xd4,0x01,0xff,0x03,0x00] @@ -415,23 +469,41 @@ v_cmpx_le_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_le_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_lg_f16 v1, v255 -// GFX12: v_cmpx_lg_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_lg_f16 v1.h, v255.h +// GFX12: v_cmpx_lg_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x85,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_lg_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lg_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lg_f16 v255, v2 -// GFX12: v_cmpx_lg_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_lg_f16 v1.l, v255.l +// GFX12: v_cmpx_lg_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_lg_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_lg_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_lg_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lg_f16 v255.h, v2.h +// GFX12: v_cmpx_lg_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x85,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lg_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lg_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_lg_f16 v255.l, v2.l +// GFX12: v_cmpx_lg_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lg_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lg_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp 
v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lt_f16 v1.h, v255.h // GFX12: v_cmpx_lt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x81,0xd4,0x01,0xff,0x03,0x00] @@ -613,146 +685,290 @@ v_cmpx_ne_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_ne_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ne_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_neq_f16 v1, v255 -// GFX12: v_cmpx_neq_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_neq_f16 v1.h, v255.h +// GFX12: v_cmpx_neq_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8d,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_neq_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_neq_f16 v1.l, v255.l +// GFX12: v_cmpx_neq_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_neq_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_neq_f16 v255.h, v2.h +// GFX12: v_cmpx_neq_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8d,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_neq_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_neq_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_neq_f16 v255.l, v2.l +// GFX12: v_cmpx_neq_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_neq_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_neq_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_nge_f16 v1.h, v255.h +// GFX12: v_cmpx_nge_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x89,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_nge_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16 v1.l, v255.l +// GFX12: v_cmpx_nge_f16_e64 v1.l, v255.l ; encoding: 
[0x7e,0x00,0x89,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_nge_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16 v255.h, v2.h +// GFX12: v_cmpx_nge_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x89,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_nge_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_nge_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_nge_f16 v255.l, v2.l +// GFX12: v_cmpx_nge_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_nge_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_nge_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_ngt_f16 v1.h, v255.h +// GFX12: v_cmpx_ngt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8b,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_ngt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ngt_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ngt_f16 v1.l, v255.l +// GFX12: v_cmpx_ngt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_ngt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ngt_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ngt_f16 v255.h, v2.h +// GFX12: v_cmpx_ngt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8b,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_ngt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_ngt_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_ngt_f16 v255.l, v2.l +// GFX12: v_cmpx_ngt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_ngt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_ngt_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_nle_f16 v1.h, v255.h +// GFX12: v_cmpx_nle_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8c,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_nle_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nle_f16 v1.l, v255.l +// GFX12: v_cmpx_nle_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_nle_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nle_f16 v255.h, v2.h +// GFX12: v_cmpx_nle_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8c,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_nle_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_nle_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_nle_f16 v255.l, v2.l +// GFX12: v_cmpx_nle_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_nle_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_nle_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_nlg_f16 v1.h, v255.h +// GFX12: v_cmpx_nlg_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8a,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_neq_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlg_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_neq_f16 v255, v2 -// GFX12: v_cmpx_neq_f16_e64 v255, v2 ; encoding: 
[0x7e,0x00,0x8d,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_nlg_f16 v1.l, v255.l +// GFX12: v_cmpx_nlg_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_neq_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_nlg_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_nlg_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nge_f16 v1, v255 -// GFX12: v_cmpx_nge_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_nlg_f16 v255.h, v2.h +// GFX12: v_cmpx_nlg_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8a,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_nge_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_nge_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlg_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_nge_f16 v255, v2 -// GFX12: v_cmpx_nge_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_nlg_f16 v255.l, v2.l +// GFX12: v_cmpx_nlg_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_nge_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_nlg_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_nge_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_nlg_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ngt_f16 v1, v255 -// GFX12: v_cmpx_ngt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_nlt_f16 v1.h, v255.h +// GFX12: v_cmpx_nlt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8e,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ngt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] 
+v_cmpx_nlt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlt_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ngt_f16 v255, v2 -// GFX12: v_cmpx_ngt_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_nlt_f16 v1.l, v255.l +// GFX12: v_cmpx_nlt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ngt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_nlt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_nlt_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nle_f16 v1, v255 -// GFX12: v_cmpx_nle_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_nlt_f16 v255.h, v2.h +// GFX12: v_cmpx_nlt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8e,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_nle_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_nle_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlt_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_nle_f16 v255, v2 -// GFX12: v_cmpx_nle_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_nlt_f16 v255.l, v2.l +// GFX12: v_cmpx_nlt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_nle_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_nlt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_nle_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_nlt_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_nlg_f16 v1, v255 -// GFX12: v_cmpx_nlg_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_o_f16 v1.h, v255.h +// GFX12: v_cmpx_o_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x87,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_nlg_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_o_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_o_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nlg_f16 v255, v2 -// GFX12: v_cmpx_nlg_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_o_f16 v1.l, v255.l +// GFX12: v_cmpx_o_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_nlg_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_o_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_o_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nlt_f16 v1, v255 -// GFX12: v_cmpx_nlt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_o_f16 v255.h, v2.h +// GFX12: v_cmpx_o_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x87,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_nlt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_o_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_nlt_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_o_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_nlt_f16 v255, v2 -// GFX12: v_cmpx_nlt_f16_e64 v255, v2 ; encoding: 
[0x7e,0x00,0x8e,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_o_f16 v255.l, v2.l +// GFX12: v_cmpx_o_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_nlt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_o_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_nlt_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_o_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_o_f16 v1, v255 -// GFX12: v_cmpx_o_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_u_f16 v1.h, v255.h +// GFX12: v_cmpx_u_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x88,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_o_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_u_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_o_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_o_f16 v255, v2 -// GFX12: v_cmpx_o_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_u_f16 v1.l, v255.l +// GFX12: v_cmpx_u_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_o_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_u_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_o_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_u_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_u_f16 v1, v255 -// GFX12: v_cmpx_u_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_u_f16 v255.h, v2.h +// GFX12: v_cmpx_u_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x88,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_u_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_u_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp 
v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_u_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x88,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_u_f16 v255, v2 -// GFX12: v_cmpx_u_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_u_f16 v255.l, v2.l +// GFX12: v_cmpx_u_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_u_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_u_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_u_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_u_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt index b73c7f83c7442..4a46eaead2390 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt @@ -445,46 +445,72 @@ # GFX11: v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_f_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_f_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x19,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -|v255.l|, 
-|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_f_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -613,46 +639,72 @@ # GFX11: v_cmpx_f_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# 
GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 
row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x19,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -917,46 +969,72 @@ # GFX11: v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 
0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + 
+0x7e,0x19,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1221,46 +1299,72 @@ # GFX11: v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -1525,46 +1629,72 @@
# GFX11: v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2159,46 +2289,72 @@
# GFX11: v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2243,46 +2399,72 @@
# GFX11: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2327,46 +2509,72 @@
# GFX11: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2411,46 +2619,72 @@
# GFX11: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2495,46 +2729,72 @@
# GFX11: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2579,46 +2839,72 @@
# GFX11: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2663,46 +2949,72 @@
# GFX11: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+#
GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x19,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x0a,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2747,46 +3059,72 @@ # GFX11: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l 
row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x19,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_t_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2915,46 +3253,72 @@ # GFX11: v_cmpx_t_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, 
v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x19,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt index 0b7e14108848c..4c197faf52c16 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt @@ -123,16 +123,32 @@ # GFX11: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_f_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_f_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x90,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_f_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x90,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -159,16 +175,32 @@ # GFX11: v_cmpx_f_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc8,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -235,16 +267,32 @@ # GFX11: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -311,16 +359,32 @@ # GFX11: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -387,16 +451,32 @@ # GFX11: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -555,16 +635,32 @@ # GFX11: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -579,16 +675,32 @@ # GFX11: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -603,16 +715,32 @@ # GFX11: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: 
v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -627,16 +755,32 @@ # GFX11: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -651,16 +795,32 @@ # GFX11: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -675,16 +835,32 @@ # GFX11: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nlt_f16_e64_dpp -v1, |v2| 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -699,16 +875,32 @@ # GFX11: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_o_f16_e64_dpp -v1, 
|v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -723,16 +915,32 @@ # GFX11: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x9f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_t_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -759,16 +967,32 @@ # GFX11: v_cmpx_t_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcf,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt index cd897944845a0..40c34708d863e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt @@ -568,10 +568,12 @@ # GFX11: v_cmpx_eq_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_f_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_f_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_f_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_f_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x80,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_f_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x04,0x00,0x00] @@ -612,6 +614,14 @@ 
0x7e,0x83,0x80,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_f_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x80,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x80,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_f_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x80,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x80,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_f_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x80,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_f_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00] @@ -856,10 +866,12 @@ # GFX11: v_cmpx_f_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd8,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00] @@ -900,6 +912,14 @@ 0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00] @@ -1254,10 +1274,12 @@ # GFX11: v_cmpx_ge_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_gt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00] @@ -1298,6 
+1320,14 @@ 0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00] @@ -1652,10 +1682,12 @@ # GFX11: v_cmpx_gt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_le_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_le_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00] @@ -1696,6 +1728,14 @@ 0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00] @@ -2050,10 +2090,12 @@ # GFX11: v_cmpx_le_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_lg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_lg_f16_e64 s1, s2 ; encoding: 
[0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00] @@ -2094,6 +2136,14 @@ 0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00] @@ -2856,10 +2906,12 @@ # GFX11: v_cmpx_ne_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_neq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00] @@ -2900,6 +2952,14 @@ 0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00] @@ -2982,10 +3042,12 @@ # GFX11: v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_nge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00 
# GFX11: v_cmpx_nge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00] @@ -3026,6 +3088,14 @@ 0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00] @@ -3108,10 +3178,12 @@ # GFX11: v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ngt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00] @@ -3152,6 +3224,14 @@ 0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00] @@ -3234,10 +3314,12 @@ # GFX11: v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_nle_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: 
[0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00] @@ -3278,6 +3360,14 @@ 0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_nle_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00] @@ -3360,10 +3450,12 @@ # GFX11: v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_nlg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00] @@ -3404,6 +3496,14 @@ 0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00] @@ -3486,10 +3586,12 @@ # GFX11: v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_nlt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.l ; encoding: 
[0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00] @@ -3530,6 +3632,14 @@ 0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00] @@ -3612,10 +3722,12 @@ # GFX11: v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_o_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_o_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00] @@ -3656,6 +3768,14 @@ 0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00] @@ -3738,10 +3858,12 @@ # GFX11: v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_t_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_t_f16_e64 
v255.l, v255.l ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8f,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_t_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x04,0x00,0x00] @@ -3782,6 +3904,14 @@ 0x7e,0x83,0x8f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_t_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x8f,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_t_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8f,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8f,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_t_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8f,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x9f,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_t_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9f,0xd4,0x01,0x05,0x02,0x00] @@ -4026,10 +4156,12 @@ # GFX11: v_cmpx_t_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdf,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_u_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_u_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00] @@ -4070,6 +4202,14 @@ 0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt index 90a95138144f1..3d16d19954ab9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt @@ -649,49 +649,84 @@ # GFX11: v_cmpx_eq_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 v1, v2 ; encoding: 
[0x01,0x05,0x00,0x7d] 0x7f,0x05,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x00,0x7d] 0x01,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x00,0x7d] 0x69,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x00,0x7d] 0x6a,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x00,0x7d] 0x6b,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x00,0x7d] 0x7b,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x00,0x7d] 0x7d,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x00,0x7d] 0x7e,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x00,0x7d] 0x7f,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x00,0x7d] 0x7c,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x00,0x7d] 0xc1,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x00,0x7d] 0xf0,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x00,0x7d] 0xfd,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x00,0x7d] 0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_f_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_f_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e32 0xfe0b, v127 ; encoding: 
[0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x00,0x7d +# GFX11-REAL16: v_cmpx_f_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x00,0x7d] + +0xff,0x05,0x00,0x7d +# GFX11-REAL16: v_cmpx_f_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x00,0x7d] + +0xf0,0xfe,0x00,0x7d +# GFX11-REAL16: v_cmpx_f_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x00,0x7d] + +0xfd,0x04,0x01,0x7d +# GFX11-REAL16: v_cmpx_f_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x01,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x01,0x7d] + +0xff,0xfe,0x01,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_f_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x01,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x01,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x20,0x7d # GFX11: v_cmpx_f_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x20,0x7d] @@ -937,49 +972,84 @@ # GFX11: v_cmpx_f_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb1,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] 0x7f,0x05,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] 0x01,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] 0x69,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] 0x6a,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] 0x6b,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] 0x7b,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] 0x7d,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] 0x7e,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: 
[0x7e,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] 0x7f,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] 0x7c,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] 0xc1,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] 0xf0,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] 0xfd,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] 0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x0c,0x7d +# GFX11-REAL16: v_cmpx_ge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0c,0x7d] + +0xff,0x05,0x0c,0x7d +# GFX11-REAL16: v_cmpx_ge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0c,0x7d] + +0xf0,0xfe,0x0c,0x7d +# GFX11-REAL16: v_cmpx_ge_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x0c,0x7d] + +0xfd,0x04,0x0d,0x7d +# GFX11-REAL16: v_cmpx_ge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0d,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x0d,0x7d] + +0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_ge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x2c,0x7d # GFX11: v_cmpx_ge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2c,0x7d] @@ -1385,49 +1455,84 @@ # GFX11: v_cmpx_ge_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] 0x7f,0x05,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 v127, v2 ; encoding: 
[0x7f,0x05,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] 0x01,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] 0x69,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] 0x6a,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] 0x6b,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] 0x7b,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] 0x7d,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] 0x7e,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] 0x7f,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] 0x7c,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] 0xc1,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] 0xf0,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] 0xfd,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] 0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] + 
+0x81,0x05,0x08,0x7d +# GFX11-REAL16: v_cmpx_gt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x08,0x7d] + +0xff,0x05,0x08,0x7d +# GFX11-REAL16: v_cmpx_gt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x08,0x7d] + +0xf0,0xfe,0x08,0x7d +# GFX11-REAL16: v_cmpx_gt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x08,0x7d] + +0xfd,0x04,0x09,0x7d +# GFX11-REAL16: v_cmpx_gt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x09,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x09,0x7d] + +0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_gt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x28,0x7d # GFX11: v_cmpx_gt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x28,0x7d] @@ -1833,49 +1938,84 @@ # GFX11: v_cmpx_gt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] 0x7f,0x05,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] 0x01,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] 0x69,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] 0x6a,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] 0x6b,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] 0x7b,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] 0x7d,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] 0x7e,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] +# 
GFX11-REAL16: v_cmpx_le_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] 0x7f,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] 0x7c,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] 0xc1,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] 0xf0,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] 0xfd,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] 0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_le_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x06,0x7d +# GFX11-REAL16: v_cmpx_le_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x06,0x7d] + +0xff,0x05,0x06,0x7d +# GFX11-REAL16: v_cmpx_le_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x06,0x7d] + +0xf0,0xfe,0x06,0x7d +# GFX11-REAL16: v_cmpx_le_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x06,0x7d] + +0xfd,0x04,0x07,0x7d +# GFX11-REAL16: v_cmpx_le_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x07,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x07,0x7d] + +0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_le_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x26,0x7d # GFX11: v_cmpx_le_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x26,0x7d] @@ -2281,49 +2421,84 @@ # GFX11: v_cmpx_le_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] 0x7f,0x05,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] +# 
GFX11-REAL16: v_cmpx_lg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] 0x01,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] 0x69,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] 0x6a,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] 0x6b,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] 0x7b,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] 0x7d,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] 0x7e,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] 0x7f,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] 0x7c,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] 0xc1,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] 0xf0,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] 0xfd,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] 0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_lg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x0a,0x7d +# GFX11-REAL16: 
v_cmpx_lg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x7d] + +0xff,0x05,0x0a,0x7d +# GFX11-REAL16: v_cmpx_lg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0a,0x7d] + +0xf0,0xfe,0x0a,0x7d +# GFX11-REAL16: v_cmpx_lg_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x0a,0x7d] + +0xfd,0x04,0x0b,0x7d +# GFX11-REAL16: v_cmpx_lg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x0b,0x7d] + +0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_lg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x2a,0x7d # GFX11: v_cmpx_lg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2a,0x7d] @@ -3212,49 +3387,84 @@ # GFX11: v_cmpx_ne_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d] 0x7f,0x05,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d] 0x01,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d] 0x69,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d] 0x6a,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d] 0x6b,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d] 0x7b,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d] 0x7d,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d] 0x7e,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d] +# GFX11-REAL16: 
v_cmpx_neq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d] 0x7f,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d] 0x7c,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d] 0xc1,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d] 0xf0,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d] 0xfd,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d] 0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_neq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x1a,0x7d +# GFX11-REAL16: v_cmpx_neq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x1a,0x7d] + +0xff,0x05,0x1a,0x7d +# GFX11-REAL16: v_cmpx_neq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x1a,0x7d] + +0xf0,0xfe,0x1a,0x7d +# GFX11-REAL16: v_cmpx_neq_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x1a,0x7d] + +0xfd,0x04,0x1b,0x7d +# GFX11-REAL16: v_cmpx_neq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1b,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x1b,0x7d] + +0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_neq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x3a,0x7d # GFX11: v_cmpx_neq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3a,0x7d] @@ -3338,49 +3548,84 @@ # GFX11: v_cmpx_neq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d] 0x7f,0x05,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 v127, v2 ; encoding: 
[0x7f,0x05,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d] 0x01,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d] 0x69,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d] 0x6a,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d] 0x6b,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d] 0x7b,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d] 0x7d,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d] 0x7e,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d] 0x7f,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d] 0x7c,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d] 0xc1,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d] 0xf0,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d] 0xfd,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d] 0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_nge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: 
[0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x12,0x7d +# GFX11-REAL16: v_cmpx_nge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x12,0x7d] + +0xff,0x05,0x12,0x7d +# GFX11-REAL16: v_cmpx_nge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x12,0x7d] + +0xf0,0xfe,0x12,0x7d +# GFX11-REAL16: v_cmpx_nge_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x12,0x7d] + +0xfd,0x04,0x13,0x7d +# GFX11-REAL16: v_cmpx_nge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x13,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x13,0x7d] + +0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_nge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x32,0x7d # GFX11: v_cmpx_nge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x32,0x7d] @@ -3464,49 +3709,84 @@ # GFX11: v_cmpx_nge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d] 0x7f,0x05,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d] 0x01,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d] 0x69,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d] 0x6a,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d] 0x6b,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d] 0x7b,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d] 0x7d,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d] 0x7e,0x04,0x16,0x7d -# 
GFX11: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d] 0x7f,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d] 0x7c,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d] 0xc1,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d] 0xf0,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d] 0xfd,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d] 0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x16,0x7d +# GFX11-REAL16: v_cmpx_ngt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x16,0x7d] + +0xff,0x05,0x16,0x7d +# GFX11-REAL16: v_cmpx_ngt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x16,0x7d] + +0xf0,0xfe,0x16,0x7d +# GFX11-REAL16: v_cmpx_ngt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x16,0x7d] + +0xfd,0x04,0x17,0x7d +# GFX11-REAL16: v_cmpx_ngt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x17,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x17,0x7d] + +0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_ngt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x36,0x7d # GFX11: v_cmpx_ngt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x36,0x7d] @@ -3590,49 +3870,84 @@ # GFX11: v_cmpx_ngt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 v1, v2 ; encoding: 
[0x01,0x05,0x18,0x7d] 0x7f,0x05,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d] 0x01,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d] 0x69,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d] 0x6a,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d] 0x6b,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d] 0x7b,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d] 0x7d,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d] 0x7e,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d] 0x7f,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d] 0x7c,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d] 0xc1,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d] 0xf0,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d] 0xfd,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d] 0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_nle_f16_e32 0xfe0b, v127.l ; encoding: 
[0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x18,0x7d +# GFX11-REAL16: v_cmpx_nle_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x18,0x7d] + +0xff,0x05,0x18,0x7d +# GFX11-REAL16: v_cmpx_nle_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x18,0x7d] + +0xf0,0xfe,0x18,0x7d +# GFX11-REAL16: v_cmpx_nle_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x18,0x7d] + +0xfd,0x04,0x19,0x7d +# GFX11-REAL16: v_cmpx_nle_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x19,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x19,0x7d] + +0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_nle_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x38,0x7d # GFX11: v_cmpx_nle_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x38,0x7d] @@ -3716,49 +4031,84 @@ # GFX11: v_cmpx_nle_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d] 0x7f,0x05,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d] 0x01,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d] 0x69,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d] 0x6a,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d] 0x6b,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d] 0x7b,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d] 0x7d,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x14,0x7d] 
+# GFX11-FAKE16: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d] 0x7e,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d] 0x7f,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d] 0x7c,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d] 0xc1,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d] 0xf0,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d] 0xfd,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d] 0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x14,0x7d +# GFX11-REAL16: v_cmpx_nlg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x14,0x7d] + +0xff,0x05,0x14,0x7d +# GFX11-REAL16: v_cmpx_nlg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x14,0x7d] + +0xf0,0xfe,0x14,0x7d +# GFX11-REAL16: v_cmpx_nlg_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x14,0x7d] + +0xfd,0x04,0x15,0x7d +# GFX11-REAL16: v_cmpx_nlg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x15,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x15,0x7d] + +0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_nlg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x34,0x7d # GFX11: v_cmpx_nlg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x34,0x7d] @@ -3842,49 +4192,84 @@ # GFX11: v_cmpx_nlg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 v1.l, v2.l ; 
encoding: [0x01,0x05,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d] 0x7f,0x05,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d] 0x01,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d] 0x69,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d] 0x6a,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d] 0x6b,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d] 0x7b,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d] 0x7d,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d] 0x7e,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d] 0x7f,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d] 0x7c,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d] 0xc1,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d] 0xf0,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d] 0xfd,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d] 0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] +# 
GFX11-REAL16: v_cmpx_nlt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x1c,0x7d +# GFX11-REAL16: v_cmpx_nlt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x1c,0x7d] + +0xff,0x05,0x1c,0x7d +# GFX11-REAL16: v_cmpx_nlt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x1c,0x7d] + +0xf0,0xfe,0x1c,0x7d +# GFX11-REAL16: v_cmpx_nlt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x1c,0x7d] + +0xfd,0x04,0x1d,0x7d +# GFX11-REAL16: v_cmpx_nlt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1d,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x1d,0x7d] + +0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_nlt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x3c,0x7d # GFX11: v_cmpx_nlt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3c,0x7d] @@ -3968,49 +4353,84 @@ # GFX11: v_cmpx_nlt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d] 0x7f,0x05,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d] 0x01,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d] 0x69,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d] 0x6a,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d] 0x6b,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d] 0x7b,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d] 0x7d,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 m0, v2.l ; encoding: 
[0x7d,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d] 0x7e,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d] 0x7f,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d] 0x7c,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d] 0xc1,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d] 0xf0,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d] 0xfd,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d] 0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_o_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x0e,0x7d +# GFX11-REAL16: v_cmpx_o_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0e,0x7d] + +0xff,0x05,0x0e,0x7d +# GFX11-REAL16: v_cmpx_o_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0e,0x7d] + +0xf0,0xfe,0x0e,0x7d +# GFX11-REAL16: v_cmpx_o_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x0e,0x7d] + +0xfd,0x04,0x0f,0x7d +# GFX11-REAL16: v_cmpx_o_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0f,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x0f,0x7d] + +0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_o_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x2e,0x7d # GFX11: v_cmpx_o_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2e,0x7d] @@ -4094,49 +4514,84 @@ # GFX11: v_cmpx_o_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1e,0x7d] +# GFX11-FAKE16: 
v_cmpx_t_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1e,0x7d] 0x7f,0x05,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7d] 0x01,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1e,0x7d] 0x69,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1e,0x7d] 0x6a,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7d] 0x6b,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7d] 0x7b,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7d] 0x7d,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7d] 0x7e,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7d] 0x7f,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7d] 0x7c,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1e,0x7d] 0xc1,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7d] 0xf0,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7d] 0xfd,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1e,0x7d] 0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_t_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_t_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e32 
0xfe0b, v127 ; encoding: [0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x1e,0x7d +# GFX11-REAL16: v_cmpx_t_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x1e,0x7d] + +0xff,0x05,0x1e,0x7d +# GFX11-REAL16: v_cmpx_t_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x1e,0x7d] + +0xf0,0xfe,0x1e,0x7d +# GFX11-REAL16: v_cmpx_t_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x1e,0x7d] + +0xfd,0x04,0x1f,0x7d +# GFX11-REAL16: v_cmpx_t_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1f,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x1f,0x7d] + +0xff,0xfe,0x1f,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_t_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1f,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x1f,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x3e,0x7d # GFX11: v_cmpx_t_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3e,0x7d] @@ -4382,49 +4837,84 @@ # GFX11: v_cmpx_t_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbf,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d] 0x7f,0x05,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d] 0x01,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d] 0x69,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d] 0x6a,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d] 0x6b,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d] 0x7b,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d] 0x7d,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] 0x7e,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: 
[0x7e,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d] 0x7f,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] 0x7c,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] 0xc1,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] 0xf0,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d] 0xfd,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d] 0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_u_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x10,0x7d +# GFX11-REAL16: v_cmpx_u_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x10,0x7d] + +0xff,0x05,0x10,0x7d +# GFX11-REAL16: v_cmpx_u_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x10,0x7d] + +0xf0,0xfe,0x10,0x7d +# GFX11-REAL16: v_cmpx_u_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x10,0x7d] + +0xfd,0x04,0x11,0x7d +# GFX11-REAL16: v_cmpx_u_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x11,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x11,0x7d] + +0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_u_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x30,0x7d # GFX11: v_cmpx_u_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x30,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt index 9b9b423a7b104..f55e646dda79b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt @@ -445,46 +445,72 @@ # GFX11: v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x0d,0x30] 
0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_f_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_f_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_ror:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_f_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x00,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_f_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_f_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_f_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_f_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x00,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_f_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_f_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x01,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_f_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x01,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_f_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x01,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x01,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_f_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x01,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_f_f16 
-|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x01,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x20,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_f_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x1b,0x00,0xff] @@ -613,46 +639,72 @@ # GFX11: v_cmpx_f_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x91,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 
row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_ge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x0c,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ge_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x0d,0x7d,0x81,0x60,0x01,0x13 +# 
GFX11-REAL16: v_cmpx_ge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0d,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0d,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_ge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_ge_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff] @@ -917,46 +969,72 @@ # GFX11: v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_gt_f16 -|v127.l|, -|v127.l| 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x08,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_gt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x09,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_gt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x09,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x09,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_gt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_gt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff] @@ -1221,46 +1299,72 @@ # GFX11: v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13 
-# GFX11: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_le_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x06,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_le_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_le_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x07,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_le_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x07,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_le_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x07,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_le_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_le_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff] @@ -1525,46 +1629,72 @@ # GFX11: v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_lg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x0a,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_lg_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lg_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x0b,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_lg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0b,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0b,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_lg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_lg_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff] @@ -2159,46 +2289,72 @@ # GFX11: v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] 
0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_neq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x1a,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_neq_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_neq_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x1b,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_neq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1b,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_neq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1b,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_neq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_neq_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff # 
GFX11: v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff] @@ -2243,46 +2399,72 @@ # GFX11: v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] +# 
GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_nge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x12,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_nge_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nge_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x13,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_nge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x13,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, 
v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x13,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_nge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nge_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff] @@ -2327,46 +2509,72 @@ # GFX11: v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 
v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_ngt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x16,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ngt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ngt_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x17,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_ngt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x17,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ngt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x17,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_ngt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_ngt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff] @@ -2411,46 +2619,72 @@ # GFX11: v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, 
v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: 
v_cmpx_nle_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_nle_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x18,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_nle_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nle_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x19,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_nle_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x19,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nle_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x19,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_nle_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nle_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff] @@ -2495,46 +2729,72 @@ # GFX11: v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_nlg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x14,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_nlg_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nlg_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x15,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_nlg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x15,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nlg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x15,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_nlg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nlg_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff] @@ -2579,46 +2839,72 @@ # GFX11: v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] 
0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_nlt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x1c,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_nlt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nlt_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x1d,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_nlt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1d,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nlt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1d,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_nlt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nlt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff] @@ -2663,46 +2949,72 @@ # GFX11: v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_o_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x0e,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_o_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_o_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x0f,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_o_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0f,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_o_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x0f,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_o_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_o_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff] @@ -2747,46 +3059,72 @@ # GFX11: v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_t_f16 
v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_t_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x1e,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_t_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_t_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_t_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_t_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x1e,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_t_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_t_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x1f,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_t_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1f,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_t_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1f,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x1f,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_t_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1f,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_t_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1f,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x3e,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_t_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x1b,0x00,0xff] @@ -2915,46 +3253,72 @@ # GFX11: v_cmpx_t_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9f,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 
row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_u_f16 
-|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_u_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x10,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_u_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_u_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x11,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_u_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x11,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_u_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x11,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_u_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_u_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt index 6ca58524688e2..72fb40a7f22a0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt @@ -127,10 +127,30 @@ # GFX11: v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_f_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_f_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_f_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x00,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_f_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_f_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_f_f16 v127, v127 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x01,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_f_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x01,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x01,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_f_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x01,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_f_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x01,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x01,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_f_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x01,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_f_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x01,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_f_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x01,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_f_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x01,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x20,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_f_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x20,0x7d,0x01,0x77,0x39,0x05] @@ -151,10 +171,30 @@ # GFX11: v_cmpx_f_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x91,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x0c,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 
'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05] @@ -227,10 +267,30 @@ # GFX11: v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x08,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 
'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_gt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_gt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05]
@@ -303,10 +363,30 @@
# GFX11: v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_le_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x06,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_le_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_le_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_le_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_le_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_le_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_le_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_le_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_le_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05]
@@ -379,10 +459,30 @@
# GFX11: v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_lg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x0a,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_lg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_lg_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_lg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_lg_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_lg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_lg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_lg_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_lg_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05]
@@ -545,10 +645,30 @@
# GFX11: v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_neq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x1a,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_neq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_neq_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_neq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_neq_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_neq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_neq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_neq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_neq_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_neq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_neq_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05]
@@ -557,10 +677,30 @@
# GFX11: v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_nge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x12,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nge_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nge_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nge_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_nge_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05]
@@ -569,10 +709,30 @@
# GFX11: v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x16,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_ngt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_ngt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_ngt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_ngt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_ngt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_ngt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_ngt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_ngt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05]
@@ -581,10 +741,30 @@
# GFX11: v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_nle_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x18,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nle_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nle_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nle_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nle_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nle_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nle_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nle_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_nle_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05]
@@ -593,10 +773,30 @@
# GFX11: v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x14,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nlg_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nlg_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nlg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nlg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nlg_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_nlg_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05]
@@ -605,10 +805,30 @@
# GFX11: v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x1c,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nlt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nlt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nlt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nlt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nlt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_nlt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05]
@@ -617,10 +837,30 @@
# GFX11: v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_o_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x0e,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_o_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_o_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_o_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_o_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_o_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_o_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_o_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_o_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05]
@@ -629,10 +869,30 @@
# GFX11: v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_t_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_t_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_t_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_t_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_t_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_t_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_t_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_t_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_t_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_t_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_t_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_t_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_t_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_t_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_t_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x3e,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_t_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3e,0x7d,0x01,0x77,0x39,0x05]
@@ -653,10 +913,30 @@
# GFX11: v_cmpx_t_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9f,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_u_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x10,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_u_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_u_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_u_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_u_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_u_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_u_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_u_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_u_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
index d3a19ae00fa21..ab2d154e9ef9f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
@@ -567,10 +567,12 @@
# GFX12: v_cmpx_eq_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_ge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00]
@@ -611,6 +613,15 @@
0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00]
@@ -967,10 +978,12 @@
# GFX12: v_cmpx_ge_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_gt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00]
@@ -1011,6 +1024,15 @@
0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00]
@@ -1367,10 +1389,12 @@
# GFX12: v_cmpx_gt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_le_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_le_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00]
@@ -1411,6 +1435,15 @@
0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00]
@@ -1767,10 +1800,12 @@
# GFX12: v_cmpx_le_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_lg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_lg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00]
@@ -1811,6 +1846,15 @@
0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00]
@@ -2576,10 +2620,12 @@
# GFX12: v_cmpx_ne_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_neq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00]
@@ -2620,6 +2666,15 @@
0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00]
@@ -2702,10 +2757,12 @@
# GFX12: v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_nge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_nge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00]
@@ -2746,6 +2803,15 @@
0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00]
@@ -2828,10 +2894,12 @@
# GFX12: v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_ngt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00]
@@ -2872,6 +2940,15 @@
0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00]
@@ -2954,10 +3031,12 @@
# GFX12: v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_nle_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00]
@@ -2998,6 +3077,15 @@
0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_nle_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00]
@@ -3080,10 +3168,12 @@
# GFX12: v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_nlg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00]
@@ -3124,6 +3214,15 @@
0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00]
@@ -3206,10 +3305,12 @@
# GFX12: v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_nlt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00]
@@ -3250,6 +3351,15 @@
0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00]
@@ -3332,10 +3442,12 @@
# GFX12: v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_o_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_o_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00]
@@ -3376,6 +3488,15 @@
0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00]
@@ -3458,10 +3579,12 @@
# GFX12: v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_u_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_u_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00]
@@ -3502,6 +3625,14 @@
0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00]
+
0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
index a1061a067d73c..f8ce4fafc0252 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
@@ -666,49 +666,123 @@
# GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX12: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00]
+
+0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00]
+
+0x7e,0x00,0x86,0xd4,0x69,0xd2,0x00,0x00
+# GFX12:
v_cmpx_ge_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x86,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x86,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x86,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x86,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x86,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x86,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_ge_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x86,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x86,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_ge_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x86,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x86,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_ge_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x86,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x86,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x86,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x86,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x86,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x86,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x86,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x86,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_ge_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x86,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x86,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_ge_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x86,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1084,49 +1158,123 @@ # GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x84,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x84,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x84,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_gt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x84,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x84,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_gt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x84,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x84,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_gt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x84,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x84,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x84,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x84,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x84,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_gt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x84,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x84,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_gt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x84,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# 
GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1502,49 +1650,123 @@ # GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x83,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x83,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 vcc_hi, 0xfe0b ; encoding: 
[0x7e,0x00,0x83,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_le_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x83,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x83,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_le_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x83,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x83,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_le_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x83,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x83,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x83,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x83,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x83,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_le_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x83,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x83,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_le_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x83,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1920,49 +2142,123 @@ # GFX12: v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: 
v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x85,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x85,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x85,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lg_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x85,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x85,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lg_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x85,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x85,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lg_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x85,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x85,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x85,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x85,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x85,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_lg_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x85,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x85,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_lg_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x85,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; 
encoding: [0x7e,0x0a,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2830,49 +3126,123 @@ # GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: 
v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 
0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8d,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8d,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_neq_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8d,0xd4,0x7b,0xfa,0x01,0x00] + 
+0x7e,0x00,0x8d,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_neq_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8d,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x8d,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_neq_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8d,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x8d,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8d,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8d,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8d,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_neq_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8d,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x8d,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_neq_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8d,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2920,49 +3290,123 @@ # GFX12: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp 
v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x89,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x89,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x89,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_nge_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x89,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x89,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_nge_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x89,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x89,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_nge_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x89,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x89,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x89,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x89,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x89,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_nge_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x89,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x89,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_nge_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x89,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: 
v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -3010,49 +3454,123 @@ # GFX12: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8b,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8b,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_ngt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8b,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x8b,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_ngt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8b,0xd4,0x7d,0xe0,0x01,0x00] + 
+0x7e,0x00,0x8b,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_ngt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8b,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x8b,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8b,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8b,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8b,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_ngt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8b,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x8b,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_ngt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8b,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -3100,49 +3618,123 @@ # GFX12: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: 
v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 
0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8c,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8c,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_nle_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8c,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x8c,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_nle_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8c,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x8c,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_nle_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8c,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x8c,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8c,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8c,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8c,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_nle_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8c,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x8c,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_nle_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8c,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -3190,49 +3782,123 @@ # GFX12: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# 
GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8a,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8a,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_nlg_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8a,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x8a,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_nlg_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8a,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x8a,0xd4,0x7e,0x82,0x01,0x00 
+# GFX12: v_cmpx_nlg_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8a,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x8a,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8a,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8a,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8a,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_nlg_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8a,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x8a,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_nlg_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8a,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -3280,49 +3946,123 @@ # GFX12: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8e,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8e,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_nlt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8e,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x8e,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_nlt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8e,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x8e,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_nlt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8e,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x8e,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8e,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8e,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8e,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_nlt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8e,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x8e,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_nlt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8e,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -3370,49 +4110,123 @@ # GFX12: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp 
v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x87,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x87,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x87,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_o_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x87,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x87,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_o_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x87,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x87,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_o_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x87,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x87,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 
|exec_hi|, null ; encoding: [0x7e,0x01,0x87,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x87,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x87,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_o_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x87,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x87,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_o_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x87,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -3460,49 +4274,123 @@ # GFX12: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + 
+0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x88,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x88,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x88,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_u_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x88,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x88,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_u_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x88,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x88,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_u_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x88,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x88,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x88,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x88,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x88,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_u_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x88,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x88,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_u_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x88,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt index 56e1ea1194a5c..83d9623c5458f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt @@ -155,22 +155,41 @@ # GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] 
fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -254,19 +273,37 @@ # GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -350,19 +387,37 @@ # GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -446,19 +501,37 @@ # GFX12: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -658,19 +731,37 @@ # GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_neq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -688,22 +779,41 @@ # GFX12: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] 
; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -721,19 +831,37 @@ # GFX12: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -751,19 +879,37 @@ # GFX12: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nle_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -781,19 +927,37 @@ # GFX12: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -811,19 +975,37 @@ # GFX12: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, s3 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -841,19 +1023,37 @@ # GFX12: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: 
v_cmpx_o_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_o_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -871,19 +1071,36 @@ # GFX12: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_u_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# 
GFX12-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt index b458e613d2f20..373cc6b6e5878 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt @@ -633,49 +633,80 @@ # GFX12: v_cmpx_eq_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] 0x7f,0x05,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] 0x01,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] 0x69,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] 0x6a,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] 0x6b,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] 0x7b,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] 0x7d,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] 0x7e,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] 0x7f,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] 0x7c,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] +# 
GFX12-REAL16: v_cmpx_ge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] 0xc1,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] 0xf0,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] 0xfd,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] 0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x0c,0x7d +# GFX12-REAL16: v_cmpx_ge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0c,0x7d] + +0xff,0x05,0x0c,0x7d +# GFX12-REAL16: v_cmpx_ge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0c,0x7d] + +0xfd,0x04,0x0d,0x7d +# GFX12-REAL16: v_cmpx_ge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0d,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x0d,0x7d] + +0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_ge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x2c,0x7d # GFX12: v_cmpx_ge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2c,0x7d] @@ -1073,49 +1104,80 @@ # GFX12: v_cmpx_ge_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] 0x7f,0x05,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] 0x01,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] 0x69,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] 0x6a,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 
vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] 0x6b,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] 0x7b,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] 0x7d,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] 0x7e,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] 0x7f,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] 0x7c,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] 0xc1,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] 0xf0,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] 0xfd,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] 0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x08,0x7d +# GFX12-REAL16: v_cmpx_gt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x08,0x7d] + +0xff,0x05,0x08,0x7d +# GFX12-REAL16: v_cmpx_gt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x08,0x7d] + +0xfd,0x04,0x09,0x7d +# GFX12-REAL16: v_cmpx_gt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x09,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x09,0x7d] 
+ +0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_gt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x28,0x7d # GFX12: v_cmpx_gt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x28,0x7d] @@ -1513,49 +1575,80 @@ # GFX12: v_cmpx_gt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] 0x7f,0x05,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] 0x01,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] 0x69,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] 0x6a,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] 0x6b,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] 0x7b,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] 0x7d,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] 0x7e,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] 0x7f,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] 0x7c,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] 0xc1,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] 0xf0,0x04,0x06,0x7d 
-# GFX12: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] 0xfd,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] 0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x06,0x7d +# GFX12-REAL16: v_cmpx_le_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x06,0x7d] + +0xff,0x05,0x06,0x7d +# GFX12-REAL16: v_cmpx_le_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x06,0x7d] + +0xfd,0x04,0x07,0x7d +# GFX12-REAL16: v_cmpx_le_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x07,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x07,0x7d] + +0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_le_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x26,0x7d # GFX12: v_cmpx_le_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x26,0x7d] @@ -1953,49 +2046,80 @@ # GFX12: v_cmpx_le_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] 0x7f,0x05,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] 0x01,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] 0x69,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] 0x6a,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] 0x6b,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 
vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] 0x7b,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] 0x7d,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] 0x7e,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] 0x7f,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] 0x7c,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] 0xc1,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] 0xf0,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] 0xfd,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] 0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_lg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x0a,0x7d +# GFX12-REAL16: v_cmpx_lg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x7d] + +0xff,0x05,0x0a,0x7d +# GFX12-REAL16: v_cmpx_lg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0a,0x7d] + +0xfd,0x04,0x0b,0x7d +# GFX12-REAL16: v_cmpx_lg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x0b,0x7d] + +0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_lg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x2a,0x7d # GFX12: v_cmpx_lg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2a,0x7d] @@ -2864,49 
+2988,80 @@ # GFX12: v_cmpx_ne_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d] 0x7f,0x05,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d] 0x01,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d] 0x69,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d] 0x6a,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d] 0x6b,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d] 0x7b,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d] 0x7d,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d] 0x7e,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d] 0x7f,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d] 0x7c,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d] 0xc1,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d] 0xf0,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d] 0xfd,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 src_scc, v2.l ; 
encoding: [0xfd,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d] 0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_neq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x1a,0x7d +# GFX12-REAL16: v_cmpx_neq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x1a,0x7d] + +0xff,0x05,0x1a,0x7d +# GFX12-REAL16: v_cmpx_neq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x1a,0x7d] + +0xfd,0x04,0x1b,0x7d +# GFX12-REAL16: v_cmpx_neq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1b,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x1b,0x7d] + +0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_neq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x3a,0x7d # GFX12: v_cmpx_neq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3a,0x7d] @@ -2990,49 +3145,80 @@ # GFX12: v_cmpx_neq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d] 0x7f,0x05,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d] 0x01,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d] 0x69,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d] 0x6a,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d] 0x6b,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d] 0x7b,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d] 0x7d,0x04,0x12,0x7d -# GFX12: 
v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d] 0x7e,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d] 0x7f,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d] 0x7c,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d] 0xc1,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d] 0xf0,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d] 0xfd,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d] 0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_nge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x12,0x7d +# GFX12-REAL16: v_cmpx_nge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x12,0x7d] + +0xff,0x05,0x12,0x7d +# GFX12-REAL16: v_cmpx_nge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x12,0x7d] + +0xfd,0x04,0x13,0x7d +# GFX12-REAL16: v_cmpx_nge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x13,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x13,0x7d] + +0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_nge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x32,0x7d # GFX12: v_cmpx_nge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x32,0x7d] @@ -3116,49 +3302,80 @@ # GFX12: v_cmpx_nge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x16,0x7d] +# 
GFX12-FAKE16: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d] 0x7f,0x05,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d] 0x01,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d] 0x69,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d] 0x6a,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d] 0x6b,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d] 0x7b,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d] 0x7d,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d] 0x7e,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d] 0x7f,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d] 0x7c,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d] 0xc1,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d] 0xf0,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d] 0xfd,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d] 0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 
0xfe0b, v127.l ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x16,0x7d +# GFX12-REAL16: v_cmpx_ngt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x16,0x7d] + +0xff,0x05,0x16,0x7d +# GFX12-REAL16: v_cmpx_ngt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x16,0x7d] + +0xfd,0x04,0x17,0x7d +# GFX12-REAL16: v_cmpx_ngt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x17,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x17,0x7d] + +0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_ngt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x36,0x7d # GFX12: v_cmpx_ngt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x36,0x7d] @@ -3242,49 +3459,80 @@ # GFX12: v_cmpx_ngt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d] 0x7f,0x05,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d] 0x01,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d] 0x69,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d] 0x6a,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d] 0x6b,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d] 0x7b,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d] 0x7d,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d] 0x7e,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: 
[0x7e,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d] 0x7f,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d] 0x7c,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d] 0xc1,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d] 0xf0,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d] 0xfd,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d] 0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_nle_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x18,0x7d +# GFX12-REAL16: v_cmpx_nle_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x18,0x7d] + +0xff,0x05,0x18,0x7d +# GFX12-REAL16: v_cmpx_nle_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x18,0x7d] + +0xfd,0x04,0x19,0x7d +# GFX12-REAL16: v_cmpx_nle_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x19,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x19,0x7d] + +0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_nle_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x38,0x7d # GFX12: v_cmpx_nle_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x38,0x7d] @@ -3368,49 +3616,80 @@ # GFX12: v_cmpx_nle_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d] 0x7f,0x05,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 v127, v2 ; 
encoding: [0x7f,0x05,0x14,0x7d] 0x01,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d] 0x69,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d] 0x6a,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d] 0x6b,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d] 0x7b,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d] 0x7d,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d] 0x7e,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d] 0x7f,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d] 0x7c,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d] 0xc1,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d] 0xf0,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d] 0xfd,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d] 0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x14,0x7d +# GFX12-REAL16: v_cmpx_nlg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x14,0x7d] +# GFX12-FAKE16: 
v_cmpx_nlg_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x14,0x7d] + +0xff,0x05,0x14,0x7d +# GFX12-REAL16: v_cmpx_nlg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x14,0x7d] + +0xfd,0x04,0x15,0x7d +# GFX12-REAL16: v_cmpx_nlg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x15,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x15,0x7d] + +0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_nlg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x34,0x7d # GFX12: v_cmpx_nlg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x34,0x7d] @@ -3494,49 +3773,80 @@ # GFX12: v_cmpx_nlg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d] 0x7f,0x05,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d] 0x01,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d] 0x69,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d] 0x6a,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d] 0x6b,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d] 0x7b,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d] 0x7d,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d] 0x7e,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d] 0x7f,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d] +# GFX12-REAL16: 
v_cmpx_nlt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d] 0x7c,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d] 0xc1,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d] 0xf0,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d] 0xfd,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d] 0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x1c,0x7d +# GFX12-REAL16: v_cmpx_nlt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x1c,0x7d] + +0xff,0x05,0x1c,0x7d +# GFX12-REAL16: v_cmpx_nlt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x1c,0x7d] + +0xfd,0x04,0x1d,0x7d +# GFX12-REAL16: v_cmpx_nlt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1d,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x1d,0x7d] + +0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_nlt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x3c,0x7d # GFX12: v_cmpx_nlt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3c,0x7d] @@ -3620,49 +3930,80 @@ # GFX12: v_cmpx_nlt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d] 0x7f,0x05,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d] 0x01,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d] 0x69,0x04,0x0e,0x7d -# GFX12: 
v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d] 0x6a,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d] 0x6b,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d] 0x7b,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d] 0x7d,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d] 0x7e,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d] 0x7f,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d] 0x7c,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d] 0xc1,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d] 0xf0,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d] 0xfd,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d] 0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_o_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x0e,0x7d +# GFX12-REAL16: v_cmpx_o_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0e,0x7d] + +0xff,0x05,0x0e,0x7d +# GFX12-REAL16: v_cmpx_o_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0e,0x7d] + +0xfd,0x04,0x0f,0x7d +# 
GFX12-REAL16: v_cmpx_o_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0f,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x0f,0x7d] + +0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_o_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_o_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x2e,0x7d # GFX12: v_cmpx_o_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2e,0x7d] @@ -3746,49 +4087,80 @@ # GFX12: v_cmpx_o_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d] 0x7f,0x05,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d] 0x01,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d] 0x69,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d] 0x6a,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d] 0x6b,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d] 0x7b,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d] 0x7d,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] 0x7e,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d] 0x7f,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] 0x7c,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] 0xc1,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 -1, v2 ; encoding: 
[0xc1,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] 0xf0,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d] 0xfd,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d] 0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_u_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x10,0x7d +# GFX12-REAL16: v_cmpx_u_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x10,0x7d] + +0xff,0x05,0x10,0x7d +# GFX12-REAL16: v_cmpx_u_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x10,0x7d] + +0xfd,0x04,0x11,0x7d +# GFX12-REAL16: v_cmpx_u_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x11,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x11,0x7d] + +0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_u_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_u_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x30,0x7d # GFX12: v_cmpx_u_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x30,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt index 0289ed58f07e3..61655c0aa2001 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt @@ -429,46 +429,68 @@ # GFX12: v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] 
0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_ge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x0d,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0d,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0d,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_ge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_ge_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff] @@ -725,46 +747,68 @@ # GFX12: v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_ror:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_gt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x09,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_gt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x09,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x09,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_gt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_gt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff] @@ -1021,46 +1065,68 @@ # GFX12: v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 
row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_le_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x07,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_le_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x07,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x07,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_le_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_le_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff] @@ -1317,46 +1383,68 @@ # GFX12: v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; 
encoding: [0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] 
0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_lg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x0b,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_lg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0b,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0b,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_lg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_lg_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff] @@ -1931,46 +2019,68 @@ # GFX12: v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_neq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x1b,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_neq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1b,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_neq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1b,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_neq_f16 -|v127.h|, -|v127.h| 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_neq_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff] @@ -2015,46 +2125,68 @@ # GFX12: v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_nge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x13,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x13,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x13,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_nge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nge_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff] @@ -2099,46 +2231,68 @@ # GFX12: v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: 
v_cmpx_ngt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_ngt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x17,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ngt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x17,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ngt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x17,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_ngt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_ngt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff] @@ -2183,46 +2337,68 @@ # GFX12: v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_nle_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x19,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nle_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x19,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nle_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x19,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_nle_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nle_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff] @@ -2267,46 +2443,68 @@ # GFX12: v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] +# 
GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_nlg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x15,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nlg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x15,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x15,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_nlg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nlg_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff] @@ -2351,46 +2549,68 @@ # GFX12: v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] 
0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_nlt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x1d,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nlt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1d,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1d,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_nlt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nlt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff] @@ -2435,46 +2655,68 @@ # GFX12: v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_o_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x0f,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_o_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0f,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_o_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0f,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_o_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_o_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff] @@ -2519,46 +2761,68 @@ # GFX12: v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_u_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x11,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_u_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x11,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_u_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x11,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_u_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_u_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt index e55d6a4c77c2a..bcd75b7bfa087 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt @@ -93,10 +93,20 @@ # GFX12: v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05 -# 
GFX12: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05] @@ -149,10 +159,20 @@ # GFX12: v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_gt_f32 v1, 
v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05] @@ -205,10 +225,20 @@ # GFX12: v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05] @@ -261,10 +291,20 @@ # GFX12: v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_lg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16 v255/*Invalid register, 
operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05] @@ -383,10 +423,20 @@ # GFX12: v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_neq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_neq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05] @@ -395,10 +445,20 @@ # GFX12: v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05] @@ -407,10 +467,20 @@ # GFX12: v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ngt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05] @@ -419,10 +489,20 @@ # GFX12: v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nle_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05 +# 
GFX12-REAL16: v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05] @@ -431,10 +511,20 @@ # GFX12: v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05] @@ -443,10 +533,20 @@ # GFX12: v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] +# 
GFX12-REAL16: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05] @@ -455,10 +555,20 @@ # GFX12: v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_o_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_o_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05] @@ -467,10 +577,20 @@ # GFX12: v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: 
v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_u_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_u_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05] From 5aafc6d58f3405662902cee006be11e599801b88 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Mon, 27 Jan 2025 16:18:47 +0100 Subject: [PATCH 205/432] [Polly] Fix typos discovered by codespell (#124545) Patch created using the following command line: ```bash codespell polly --skip="*.pdf,polly/lib/External/*" --write-changes \ --ignore-words-list=couter,createor,distribues,doble,identty,indention,indx,olt,ore,padd,sais,te,theses ``` --- polly/docs/Architecture.rst | 2 +- polly/docs/doxygen.cfg.in | 4 ++-- polly/include/polly/CodeGen/RuntimeDebugBuilder.h | 2 +- polly/include/polly/DependenceInfo.h | 4 ++-- polly/include/polly/ScopInfo.h | 2 +- polly/include/polly/Support/SCEVAffinator.h | 2 +- polly/include/polly/Support/ScopHelper.h | 4 ++-- polly/lib/Analysis/ScopBuilder.cpp | 2 +- polly/lib/Analysis/ScopDetectionDiagnostic.cpp | 2 +- polly/lib/CodeGen/IslNodeBuilder.cpp | 2 +- polly/lib/CodeGen/Utils.cpp | 2 +- polly/lib/Support/SCEVAffinator.cpp | 2 +- polly/lib/Support/ScopHelper.cpp | 2 +- polly/lib/Transform/DeLICM.cpp | 2 +- polly/lib/Transform/ManualOptimizer.cpp | 2 +- polly/lib/Transform/MatmulOptimizer.cpp | 2 +- polly/lib/Transform/ScheduleOptimizer.cpp | 2 +- polly/lib/Transform/ScheduleTreeTransform.cpp | 8 ++++---- .../CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll | 2 +- polly/test/CodeGen/multiple-codegens.ll | 2 +- polly/test/CodeGen/multiple-scops-in-a-row.ll | 2 +- polly/test/CodeGen/reduction_2.ll | 2 +- polly/test/CodeGen/scalar-store-from-same-bb.ll | 2 +- polly/test/CodeGen/test-invalid-operands-for-select.ll | 2 +- polly/test/DeLICM/load-in-cond-inf-loop.ll | 2 +- polly/test/DeLICM/pr41656.ll | 2 +- polly/test/DeLICM/pr48783.ll | 2 +- polly/test/DeLICM/reject_outofquota.ll | 2 +- .../reduction_modulo_schedule_multiple_dimensions_2.ll | 2 +- .../reduction_modulo_schedule_multiple_dimensions_3.ll | 2 +- .../reduction_modulo_schedule_multiple_dimensions_4.ll | 2 +- .../pattern-matching-based-opts-after-delicm.ll | 2 +- .../pattern-matching-based-opts-after-delicm_2.ll | 2 +- 
polly/test/ScopDetect/scev_remove_max.ll | 2 +- .../test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll | 2 +- .../test/ScopInfo/NonAffine/non_affine_loop_used_later.ll | 2 +- .../test/ScopInfo/allow-all-parameters-dereferencable.ll | 2 +- .../invariant_same_loop_bound_multiple_times-1.ll | 2 +- .../invariant_same_loop_bound_multiple_times-2.ll | 2 +- polly/test/ScopInfo/multidim_gep_pointercast2.ll | 2 +- polly/test/ScopInfo/multidim_many_references.ll | 2 +- polly/test/ScopInfo/scalar_to_array.ll | 2 +- polly/test/ScopInfo/zero_ext_of_truncate.ll | 2 +- polly/test/create_ll.sh | 2 +- polly/utils/pyscop/isl.py | 4 ++-- polly/www/changelog.html | 2 +- polly/www/get_started.html | 2 +- polly/www/index.html | 8 ++++---- polly/www/projects.html | 2 +- polly/www/publications.html | 2 +- 50 files changed, 60 insertions(+), 60 deletions(-) diff --git a/polly/docs/Architecture.rst b/polly/docs/Architecture.rst index 645d6522e8694..506e0ff8d26be 100644 --- a/polly/docs/Architecture.rst +++ b/polly/docs/Architecture.rst @@ -27,7 +27,7 @@ executed in the so-called **Inliner cycle**, This is again a set of **Scalar Simplification** passes, a set of **Simple Loop Optimizations**, and the **Inliner** itself. Even though these passes make up the majority of the LLVM pass pipeline, the primary goal of these passes is still canonicalization -without loosing semantic information that complicates later analysis. As part of +without losing semantic information that complicates later analysis. As part of the inliner cycle, the LLVM inliner step-by-step tries to inline functions, runs canonicalization passes to exploit newly exposed simplification opportunities, and then tries to inline the further simplified functions. Some simple loop diff --git a/polly/docs/doxygen.cfg.in b/polly/docs/doxygen.cfg.in index befe6e138ec27..a553fc8154fb1 100644 --- a/polly/docs/doxygen.cfg.in +++ b/polly/docs/doxygen.cfg.in @@ -1066,7 +1066,7 @@ HTML_STYLESHEET = # defined cascading style sheet that is included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefor more robust against future updates. +# standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet file to the output directory. For an example # see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1950,7 +1950,7 @@ PREDEFINED = EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will -# remove all refrences to function-like macros that are alone on a line, have an +# remove all references to function-like macros that are alone on a line, have an # all uppercase name, and do not end with a semicolon. Such function macros are # typically used for boiler-plate code, and will confuse the parser if not # removed. diff --git a/polly/include/polly/CodeGen/RuntimeDebugBuilder.h b/polly/include/polly/CodeGen/RuntimeDebugBuilder.h index 46193f9ccaf2c..2e5349d9edd6e 100644 --- a/polly/include/polly/CodeGen/RuntimeDebugBuilder.h +++ b/polly/include/polly/CodeGen/RuntimeDebugBuilder.h @@ -118,7 +118,7 @@ struct RuntimeDebugBuilder { /// Call fflush /// - /// @parma Builder The builder used to insert the code. + /// @param Builder The builder used to insert the code. 
static void createFlush(PollyIRBuilder &Builder); }; } // namespace polly diff --git a/polly/include/polly/DependenceInfo.h b/polly/include/polly/DependenceInfo.h index 7526a294c6baf..d562ad80592f2 100644 --- a/polly/include/polly/DependenceInfo.h +++ b/polly/include/polly/DependenceInfo.h @@ -211,7 +211,7 @@ struct DependenceAnalysis final : public AnalysisInfoMixin { /// Invalidate the dependence information and recompute it when needed /// again. - /// May be required when the underlaying Scop was changed in a way that + /// May be required when the underlying Scop was changed in a way that /// would add new dependencies (e.g. between new statement instances /// insierted into the SCoP) or intentionally breaks existing ones. It is /// not required when updating the schedule that conforms the existing @@ -251,7 +251,7 @@ class DependenceInfo final : public ScopPass { const Dependences &recomputeDependences(Dependences::AnalysisLevel Level); /// Invalidate the dependence information and recompute it when needed again. - /// May be required when the underlaying Scop was changed in a way that would + /// May be required when the underlying Scop was changed in a way that would /// add new dependencies (e.g. between new statement instances insierted into /// the SCoP) or intentionally breaks existing ones. It is not required when /// updating the schedule that conforms the existing dependencies. diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h index 974de817e72db..ab0f81dd2836d 100644 --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -1494,7 +1494,7 @@ class ScopStmt final { /// @param Access The access to add. /// @param Prepend If true, will add @p Access before all other instructions /// (instead of appending it). - void addAccess(MemoryAccess *Access, bool Preprend = false); + void addAccess(MemoryAccess *Access, bool Prepend = false); /// Remove a MemoryAccess from this statement. /// diff --git a/polly/include/polly/Support/SCEVAffinator.h b/polly/include/polly/Support/SCEVAffinator.h index 63a341a01309b..faacfd8ba0e69 100644 --- a/polly/include/polly/Support/SCEVAffinator.h +++ b/polly/include/polly/Support/SCEVAffinator.h @@ -50,7 +50,7 @@ class SCEVAffinator final : public llvm::SCEVVisitor { /// Check an AddRec for the loop @p L is cached. bool hasNSWAddRecForLoop(llvm::Loop *L) const; - /// Return the LoopInfo used by thi object. + /// Return the LoopInfo used by the object. llvm::LoopInfo *getLI() const { return &LI; } private: diff --git a/polly/include/polly/Support/ScopHelper.h b/polly/include/polly/Support/ScopHelper.h index 13852ecb18ee7..7818f67b505fd 100644 --- a/polly/include/polly/Support/ScopHelper.h +++ b/polly/include/polly/Support/ScopHelper.h @@ -83,7 +83,7 @@ using RecordedAssumptionsTy = llvm::SmallVector; /// /// This function will add the assumption to the RecordedAssumptions. This /// collection will be added (@see addAssumption) to the assumed context once -/// all paramaters are known and the context is fully built. +/// all parameters are known and the context is fully built. /// /// @param RecordedAssumption container which keeps all recorded assumptions. /// @param Kind The assumption kind describing the underlying cause. 
@@ -132,7 +132,7 @@ using BoxedLoopsSetTy = llvm::SetVector; /// isNull(), isInstruction(), isLoad(), isStore(), ..., isMemTransferInst(), /// operator bool(), operator!() /// -/// The functions isa, cast, cast_or_null, dyn_cast are modeled te resemble +/// The functions isa, cast, cast_or_null, dyn_cast are modeled to resemble /// those from llvm/Support/Casting.h. Partial template function specialization /// is currently not supported in C++ such that those cannot be used directly. /// (llvm::isa could, but then llvm:cast etc. would not have the expected diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp index 82fa9e11550f2..76c9b4775784e 100644 --- a/polly/lib/Analysis/ScopBuilder.cpp +++ b/polly/lib/Analysis/ScopBuilder.cpp @@ -2522,7 +2522,7 @@ combineReductionType(MemoryAccess::ReductionType RT0, return MemoryAccess::RT_NONE; } -/// True if @p AllAccs intersects with @p MemAccs execpt @p LoadMA and @p +/// True if @p AllAccs intersects with @p MemAccs except @p LoadMA and @p /// StoreMA bool hasIntersectingAccesses(isl::set AllAccs, MemoryAccess *LoadMA, MemoryAccess *StoreMA, isl::set Domain, diff --git a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp index 14a6f074454f7..f810d543f1ac0 100644 --- a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp +++ b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp @@ -71,7 +71,7 @@ static Statistic RejectStatistics[] = { SCOP_STAT(LoopOnlySomeLatches, "Not all loop latches in scop"), SCOP_STAT(FuncCall, "Function call with side effects"), SCOP_STAT(NonSimpleMemoryAccess, - "Compilated access semantics (volatile or atomic)"), + "Complicated access semantics (volatile or atomic)"), SCOP_STAT(Alias, "Base address aliasing"), SCOP_STAT(Other, ""), SCOP_STAT(IntToPtr, "Integer to pointer conversions"), diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp index 739bd63a5eb80..bbe7bc42cd833 100644 --- a/polly/lib/CodeGen/IslNodeBuilder.cpp +++ b/polly/lib/CodeGen/IslNodeBuilder.cpp @@ -1159,7 +1159,7 @@ bool IslNodeBuilder::preloadInvariantEquivClass( // For an equivalence class of invariant loads we pre-load the representing // element with the unified execution context. However, we have to map all // elements of the class to the one preloaded load as they are referenced - // during the code generation and therefor need to be mapped. + // during the code generation and therefore need to be mapped. const MemoryAccessList &MAs = IAClass.InvariantAccesses; if (MAs.empty()) return true; diff --git a/polly/lib/CodeGen/Utils.cpp b/polly/lib/CodeGen/Utils.cpp index 3afb2e580889b..e95705616022f 100644 --- a/polly/lib/CodeGen/Utils.cpp +++ b/polly/lib/CodeGen/Utils.cpp @@ -46,7 +46,7 @@ static BasicBlock *splitEdge(BasicBlock *Prev, BasicBlock *Succ, // llvm::SplitCriticalEdge is more efficient than // llvm::SplitBlockPredecessors, which is more general. In the future we might // either modify llvm::SplitCriticalEdge to allow skipping the critical edge - // check; or Copy&Pase it here. + // check; or Copy&Paste it here. 
BasicBlock *MiddleBlock = SplitBlockPredecessors( Succ, ArrayRef(Prev), Suffix, DT, LI); diff --git a/polly/lib/Support/SCEVAffinator.cpp b/polly/lib/Support/SCEVAffinator.cpp index ce4467f082ba4..87e0fc056ca4b 100644 --- a/polly/lib/Support/SCEVAffinator.cpp +++ b/polly/lib/Support/SCEVAffinator.cpp @@ -502,7 +502,7 @@ PWACtx SCEVAffinator::visitUDivExpr(const SCEVUDivExpr *Expr) { } // TODO: One can represent the dividend as piece-wise function to be more - // precise but therefor a heuristic is needed. + // precise but therefore a heuristic is needed. // Assume a non-negative dividend. takeNonNegativeAssumption(DividendPWAC, RecordedAssumptions); diff --git a/polly/lib/Support/ScopHelper.cpp b/polly/lib/Support/ScopHelper.cpp index bece390d31dc6..fbf891be0a1e7 100644 --- a/polly/lib/Support/ScopHelper.cpp +++ b/polly/lib/Support/ScopHelper.cpp @@ -456,7 +456,7 @@ struct ScopExpander final : SCEVVisitor { // FIXME: This emits a SCEV for GenSE (since GenLRepl will refer to the // induction variable of a generated loop), so we should not use SCEVVisitor - // with it. Howver, it still contains references to the SCoP region. + // with it. However, it still contains references to the SCoP region. return visit(Evaluated); } ///} diff --git a/polly/lib/Transform/DeLICM.cpp b/polly/lib/Transform/DeLICM.cpp index b7e464e6739c6..9a9768afe113e 100644 --- a/polly/lib/Transform/DeLICM.cpp +++ b/polly/lib/Transform/DeLICM.cpp @@ -1290,7 +1290,7 @@ class DeLICMImpl final : public ZoneAlgorithm { continue; } - // Check for more than one element acces per statement instance. + // Check for more than one element access per statement instance. // Currently we expect write accesses to be functional, eg. disallow // // { Stmt[0] -> [i] : 0 <= i < 2 } diff --git a/polly/lib/Transform/ManualOptimizer.cpp b/polly/lib/Transform/ManualOptimizer.cpp index 0e330f207fbc4..98ab9819e5b1f 100644 --- a/polly/lib/Transform/ManualOptimizer.cpp +++ b/polly/lib/Transform/ManualOptimizer.cpp @@ -149,7 +149,7 @@ class SearchTransformVisitor final // transformed in innermost-first order. isl::schedule Result; - /// Check wether a schedule after a transformation is legal. Return the old + /// Check whether a schedule after a transformation is legal. Return the old /// schedule without the transformation. isl::schedule checkDependencyViolation(llvm::MDNode *LoopMD, llvm::Value *CodeRegion, diff --git a/polly/lib/Transform/MatmulOptimizer.cpp b/polly/lib/Transform/MatmulOptimizer.cpp index ff1683b2d63c5..01d431a97e7db 100644 --- a/polly/lib/Transform/MatmulOptimizer.cpp +++ b/polly/lib/Transform/MatmulOptimizer.cpp @@ -1759,7 +1759,7 @@ static bool isTCPattern(isl::schedule_node Node, const Dependences *D, // // For example, this covers the matrix multiplication pattern after a full // run of -polly-optree and -polly-delicm, where the write access is not - // through the original memory access, but trough a PHI node that was + // through the original memory access, but through a PHI node that was // delicmed. Subsequently, such band nodes will be replaced by a single band // node. // diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 55d51982d90e7..070700a64a168 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -771,7 +771,7 @@ static void runIslScheduleOptimizer( return; } - // Apply ISL's algorithm only if not overriden by the user. Note that + // Apply ISL's algorithm only if not overridden by the user. 
Note that // post-rescheduling optimizations (tiling, pattern-based, prevectorization) // rely on the coincidence/permutable annotations on schedule tree bands that // are added by the rescheduling analyzer. Therefore, disabling the diff --git a/polly/lib/Transform/ScheduleTreeTransform.cpp b/polly/lib/Transform/ScheduleTreeTransform.cpp index f0684de825d27..3f3630027e6e3 100644 --- a/polly/lib/Transform/ScheduleTreeTransform.cpp +++ b/polly/lib/Transform/ScheduleTreeTransform.cpp @@ -572,13 +572,13 @@ class BandCollapseRewriter final isl::schedule_node_band Band = RootBand; isl::ctx Ctx = Band.ctx(); - // Do not merge permutable band to avoid loosing the permutability property. + // Do not merge permutable band to avoid losing the permutability property. // Cannot collapse even two permutable loops, they might be permutable // individually, but not necassarily across. if (unsignedFromIslSize(Band.n_member()) > 1u && Band.permutable()) return getBase().visitBand(Band); - // Find collapsable bands. + // Find collapsible bands. SmallVector Nest; int NumTotalLoops = 0; isl::schedule_node Body; @@ -884,10 +884,10 @@ class GreedyFusionRewriter final collectPotentiallyFusableBands(Child, Bands, Child); } - // Direct children that had at least one of its decendants fused. + // Direct children that had at least one of its descendants fused. SmallDenseSet ChangedDirectChildren; - // Fuse neigboring bands until reaching the end of candidates. + // Fuse neighboring bands until reaching the end of candidates. int i = 0; while (i + 1 < (int)Bands.size()) { isl::schedule Fused = diff --git a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll index 28531244421d1..eb7de01ba862c 100644 --- a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll +++ b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll @@ -1,4 +1,4 @@ -; This test checks that we do not accidently mutate the debug info when +; This test checks that we do not accidentally mutate the debug info when ; inserting loop parallel metadata. ; RUN: opt %loadNPMPolly < %s -S -polly -passes=polly-codegen -polly-ast-detect-parallel | FileCheck %s ; CHECK-NOT: !7 = !{!7} diff --git a/polly/test/CodeGen/multiple-codegens.ll b/polly/test/CodeGen/multiple-codegens.ll index 2fa974e66df50..a63f8a615ff9e 100644 --- a/polly/test/CodeGen/multiple-codegens.ll +++ b/polly/test/CodeGen/multiple-codegens.ll @@ -6,7 +6,7 @@ ; RegionPassManager. -polly-codegen must not reuse the -polly-ast analysis the ; was created for the first -polly-scops pass. ; The current solution is that only the first -polly-codegen is allowed to -; generate code, the second detects it is re-using an IslAst that belongs to a +; generate code, the second detects it is reusing an IslAst that belongs to a ; different ScopInfo. ; ; int a, b, c; diff --git a/polly/test/CodeGen/multiple-scops-in-a-row.ll b/polly/test/CodeGen/multiple-scops-in-a-row.ll index b81ba04e36463..effae223c152a 100644 --- a/polly/test/CodeGen/multiple-scops-in-a-row.ll +++ b/polly/test/CodeGen/multiple-scops-in-a-row.ll @@ -6,7 +6,7 @@ ; We explicitly check here that the second scop is not code generated. Later ; improvements may make this possible (e.g., Polly gaining support for -; parameteric conditional expressions or a changed code generation order). +; parametric conditional expressions or a changed code generation order). 
; However, in case this happens, we want to ensure this test case is been ; reasoned about and updated accordingly. diff --git a/polly/test/CodeGen/reduction_2.ll b/polly/test/CodeGen/reduction_2.ll index 4aa306775e781..060a1866870e4 100644 --- a/polly/test/CodeGen/reduction_2.ll +++ b/polly/test/CodeGen/reduction_2.ll @@ -88,7 +88,7 @@ if.end: ; preds = %if.then, %for.end declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i32, i1) nounwind ; This is a negative test. We can prove that RED[0] in the conditional after -; the loop is dereferencable and consequently expand the SCoP from the +; the loop is dereferenceable and consequently expand the SCoP from the ; loop to include the conditional. However, during SCoP generation we realize ; that, while RED[0] is invariant, it is written to as part of the same scop ; and can consequently not be hoisted. Hence, we invalidate the scop. diff --git a/polly/test/CodeGen/scalar-store-from-same-bb.ll b/polly/test/CodeGen/scalar-store-from-same-bb.ll index 3f232da37e4cd..0c1164b245a43 100644 --- a/polly/test/CodeGen/scalar-store-from-same-bb.ll +++ b/polly/test/CodeGen/scalar-store-from-same-bb.ll @@ -2,7 +2,7 @@ ; RUN: -passes=polly-codegen -S < %s | FileCheck %s ; This test ensures that the expression N + 1 that is stored in the phi-node -; alloca, is directly computed and not incorrectly transfered through memory. +; alloca, is directly computed and not incorrectly transferred through memory. ; CHECK: store i64 [[REG:%.*]], ptr %res.phiops ; CHECK: [[REG]] = add i64 %N, 1 diff --git a/polly/test/CodeGen/test-invalid-operands-for-select.ll b/polly/test/CodeGen/test-invalid-operands-for-select.ll index 9f5013cf1bb16..fdc98fbb4d9e7 100644 --- a/polly/test/CodeGen/test-invalid-operands-for-select.ll +++ b/polly/test/CodeGen/test-invalid-operands-for-select.ll @@ -2,7 +2,7 @@ ; ; Check that we do not crash as described here: http://llvm.org/PR21167 ; -; In case the pieceweise affine function used to create an isl_ast_expr +; In case the piecewise affine function used to create an isl_ast_expr ; had empty cases (e.g., with contradicting constraints on the ; parameters), it was possible that the condition of the isl_ast_expr ; select was not a comparison but a constant (thus of type i64). diff --git a/polly/test/DeLICM/load-in-cond-inf-loop.ll b/polly/test/DeLICM/load-in-cond-inf-loop.ll index f6e23110aa6f1..a78a4691bb0d5 100644 --- a/polly/test/DeLICM/load-in-cond-inf-loop.ll +++ b/polly/test/DeLICM/load-in-cond-inf-loop.ll @@ -1,6 +1,6 @@ ; RUN: opt %loadNPMPolly '-passes=print' -disable-output < %s | FileCheck %s -; When %b is 0, %for.body13 is an infite loop. In this case the loaded +; When %b is 0, %for.body13 is an infinite loop. In this case the loaded ; value %1 is not used anywhere. ; This is a problem when DeLICM tries to map %1 to %arrayidx16 because ; %1 has no corresponding when %b == 0 and therefore hat no location diff --git a/polly/test/DeLICM/pr41656.ll b/polly/test/DeLICM/pr41656.ll index d7cfde35a6e80..2a92503809a24 100644 --- a/polly/test/DeLICM/pr41656.ll +++ b/polly/test/DeLICM/pr41656.ll @@ -4,7 +4,7 @@ ; ; This test case has an InvalidContext such that part of the predecessors ; of for.body.us.i lie within the invalid context. This causes a -; consistency check withing the invalid context of PR41656 to fail. +; consistency check within the invalid context of PR41656 to fail. 
; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/DeLICM/pr48783.ll b/polly/test/DeLICM/pr48783.ll index e3c3eb6a19ccf..deba8bfcc5daf 100644 --- a/polly/test/DeLICM/pr48783.ll +++ b/polly/test/DeLICM/pr48783.ll @@ -4,7 +4,7 @@ ; ; PHI predecessors of statement instances can only be reliably derived in defined behaviour situations. In this case, the inner loop's counter would overflow when its upper bound (%call24) is lower than its lower bound (2). However, due to the nsw flag, this would be undefined behavior and therefore not added to any runtime-check context, but to the defined-behaviour context. ; -; Dereived from test case pr41656.ll +; Derived from test case pr41656.ll ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/DeLICM/reject_outofquota.ll b/polly/test/DeLICM/reject_outofquota.ll index 820679a5349d2..9bc6bf1f23733 100644 --- a/polly/test/DeLICM/reject_outofquota.ll +++ b/polly/test/DeLICM/reject_outofquota.ll @@ -66,7 +66,7 @@ return: ; CHECK: maximal number of operations exceeded during zone analysis ; Check that even if the quota was exceeded in DeLICM, DependenceInfo is still -; successfull since it uses a different operations counter. +; successful since it uses a different operations counter. ; ; DEP: RAW dependences: ; DEP-NOT: n/a diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll index 46b2559c6e0b1..d7f9029fd347a 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll @@ -1,6 +1,6 @@ ; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; -; Verify that the outer dimension doesnt't carry reduction dependences +; Verify that the outer dimension doesn't carry reduction dependences ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll index 6f40ee90fef53..f18060a2e20a8 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll @@ -1,6 +1,6 @@ ; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; -; Verify that the outer dimension doesnt't carry reduction dependences +; Verify that the outer dimension doesn't carry reduction dependences ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll index f82b9569a88b1..8e2a590c5f57c 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll @@ -1,6 +1,6 @@ ; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; -; Verify that the outer dimension doesnt't carry reduction dependences +; Verify that the outer dimension doesn't carry reduction dependences ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git 
a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll index 8228a5c08f598..6e9ade869ec6c 100644 --- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll +++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll @@ -7,7 +7,7 @@ ; Check that the pattern matching detects the matrix multiplication pattern ; after a full run of -polly-optree and -polly-delicm, where the write access -; is not through the original memory access, but trough a PHI node that was +; is not through the original memory access, but through a PHI node that was ; delicmed. This test covers the polybench 2mm and 3mm cases. ; ; This test case generates the following schedule, which contains filters: diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll index 4bda7584f5962..4ef0605a0ba75 100644 --- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll +++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll @@ -5,7 +5,7 @@ ; ; Check that the pattern matching detects the tensor contraction pattern ; after a full run of -polly-delicm. This test case generates the following -; schedule, which contans two band nodes. Without DeLICM two statement are +; schedule, which contains two band nodes. Without DeLICM two statement are ; generated. ; ; domain: "{ Stmt5[i0, i1, i2, i3, i4, i5] : 0 <= i0 <= 31 and 0 <= i1 <= 31 and diff --git a/polly/test/ScopDetect/scev_remove_max.ll b/polly/test/ScopDetect/scev_remove_max.ll index caf55bf87a667..f76c832ff08f5 100644 --- a/polly/test/ScopDetect/scev_remove_max.ll +++ b/polly/test/ScopDetect/scev_remove_max.ll @@ -1,6 +1,6 @@ ; RUN: opt %loadNPMPolly '-passes=print' < %s -; This test case helps to determine wether SCEVRemoveMax::remove produces +; This test case helps to determine whether SCEVRemoveMax::remove produces ; an infinite loop and a segmentation fault, if it processes, for example, ; '((-1 + (-1 * %b1)) umax {(-1 + (-1 * %yStart)),+,-1}<%.preheader>)'. ; diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll index 92028093f70bb..3743bfae9fcaf 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll @@ -1,7 +1,7 @@ ; XFAIL: * ; The test case stopped making sense after r310940 that added infinite loops to -; the PostDominatorTree. Infinite loops are postdominated ony by the virtual +; the PostDominatorTree. Infinite loops are postdominated only by the virtual ; root, which causes them not to appear in regions in ScopDetection anymore. 
; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops '-passes=print' -disable-output < %s 2>&1 | FileCheck %s diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll index 79b61eca258f2..63ff354d7e5f7 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll @@ -1,7 +1,7 @@ ; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print,print' -disable-output < %s 2>&1 | FileCheck %s ; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false '-passes=print,print' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; -; Verify that we over approximate the read acces of A[j] in the last statement as j is +; Verify that we over approximate the read access of A[j] in the last statement as j is ; computed in a non-affine loop we do not model. ; ; CHECK: Function: f diff --git a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll index cb06e352da658..70c3c56fb3112 100644 --- a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll +++ b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll @@ -28,7 +28,7 @@ ; CODE-RTC-NEXT: br i1 %{{[a-zA-Z0-9\.]*}}, label %polly.preload.exec, label %polly.preload.merge ; Check that we don't generate a runtime check because we treat all -; parameters as dereferencable. +; parameters as dereferenceable. ; CODE-NOT: polly.preload.cond: ; preds = %polly.preload.begin ; CODE-NOT: br i1 %{{r1:[a-zA-Z0-9]*}}, label %polly.preload.exec, label %polly.preload.merge diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll index a473ef30376c1..e3292b4e4aefa 100644 --- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll +++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll @@ -1,7 +1,7 @@ ; RUN: opt %loadNPMPolly '-passes=print' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we only have one parameter and one invariant load for all -; three loads that occure in the region but actually access the same +; three loads that occur in the region but actually access the same ; location. Also check that the execution context is the most generic ; one, e.g., here the universal set. ; diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll index 66a0bc631b1dc..d69438de5817f 100644 --- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll +++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll @@ -1,7 +1,7 @@ ; RUN: opt %loadNPMPolly '-passes=print' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we only have one parameter and one invariant load for all -; three loads that occure in the region but actually access the same +; three loads that occur in the region but actually access the same ; location. Also check that the execution context is the most generic ; one, e.g., here the universal set. 
; diff --git a/polly/test/ScopInfo/multidim_gep_pointercast2.ll b/polly/test/ScopInfo/multidim_gep_pointercast2.ll index b31a0d0262db9..9daae4b1ce3db 100644 --- a/polly/test/ScopInfo/multidim_gep_pointercast2.ll +++ b/polly/test/ScopInfo/multidim_gep_pointercast2.ll @@ -1,6 +1,6 @@ ; RUN: opt %loadNPMPolly '-passes=print' -disable-output < %s 2>&1 | FileCheck %s ; -; Verfy that we do not use the GetElementPtr information to delinearize A +; Verify that we do not use the GetElementPtr information to delinearize A ; because of the cast in-between. Use the single-dimensional modeling instead. ; ; void f(short A[][2]) { diff --git a/polly/test/ScopInfo/multidim_many_references.ll b/polly/test/ScopInfo/multidim_many_references.ll index 3801fda4923c3..f0f1c2b1f39db 100644 --- a/polly/test/ScopInfo/multidim_many_references.ll +++ b/polly/test/ScopInfo/multidim_many_references.ll @@ -28,7 +28,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; This test case verifies that the construction of the assumed context finishes ; successfully. Depending on how constrained are accumulated in the assumed ; context, this test case can take even for a smaller number of arrays over a -; minute to complete. With the unrolling choosen in this test, an inefficient +; minute to complete. With the unrolling chosen in this test, an inefficient ; formulation of the assumption tracking cause LLVM to crash due to excessive ; memory usage due to an overly large number of disjuncts being formed. diff --git a/polly/test/ScopInfo/scalar_to_array.ll b/polly/test/ScopInfo/scalar_to_array.ll index d64f1696c30b6..3f61d0d723046 100644 --- a/polly/test/ScopInfo/scalar_to_array.ll +++ b/polly/test/ScopInfo/scalar_to_array.ll @@ -109,7 +109,7 @@ return: ; preds = %for.cond ; It is not possible to have a scop which accesses a scalar element that is ; a global variable. All global variables are pointers containing possibly ; a single element. Hence they do not need to be handled anyways. -; Please note that this is still required when scalar to array rewritting is +; Please note that this is still required when scalar to array rewriting is ; disabled. ; CHECK-LABEL: Function: use_after_scop diff --git a/polly/test/ScopInfo/zero_ext_of_truncate.ll b/polly/test/ScopInfo/zero_ext_of_truncate.ll index bd3749b6aa74f..cbe4af05169f8 100644 --- a/polly/test/ScopInfo/zero_ext_of_truncate.ll +++ b/polly/test/ScopInfo/zero_ext_of_truncate.ll @@ -8,7 +8,7 @@ ; } ; } ; -; FIXME: The truncated value should be a paramter. +; FIXME: The truncated value should be a parameter. ; CHECK: Assumed Context: ; CHECK-NEXT: [N, tmp, M] -> { : } ; CHECK-NEXT: Invalid Context: diff --git a/polly/test/create_ll.sh b/polly/test/create_ll.sh index c44d8ed81f2b5..1c03d9a015e01 100755 --- a/polly/test/create_ll.sh +++ b/polly/test/create_ll.sh @@ -12,7 +12,7 @@ opt -correlated-propagation -mem2reg -instcombine -loop-simplify -indvars \ -instnamer ${LLFILE} -S -o ${LLFILE_TMP} # Insert a header into the new testcase containing a sample RUN line a FIXME and -# an XFAIL. Then insert the formated C code and finally the LLVM-IR without +# an XFAIL. Then insert the formatted C code and finally the LLVM-IR without # attributes, the module ID or the target triple. 
 echo '; RUN: opt %loadPolly -S < %s | FileCheck %s' > ${LLFILE}
 echo ';' >> ${LLFILE}
diff --git a/polly/utils/pyscop/isl.py b/polly/utils/pyscop/isl.py
index c06b7bca28042..c5d92ae9812de 100644
--- a/polly/utils/pyscop/isl.py
+++ b/polly/utils/pyscop/isl.py
@@ -72,7 +72,7 @@ def initialize_isl_methods(self):
         if hasattr(self.__class__, "initialized"):
             return
 
-        self.__class__.initalized = True
+        self.__class__.initialized = True
         self.get_isl_method("read_from_str").argtypes = [Context, c_char_p, c_int]
         self.get_isl_method("copy").argtypes = [self.__class__]
         self.get_isl_method("copy").restype = c_int
@@ -204,7 +204,7 @@ def initialize_isl_methods(self):
         if hasattr(self.__class__, "initialized"):
             return
 
-        self.__class__.initalized = True
+        self.__class__.initialized = True
         self.get_isl_method("copy").argtypes = [self.__class__]
         self.get_isl_method("copy").restype = c_int
         self.get_isl_method("free").argtypes = [self.__class__]
diff --git a/polly/www/changelog.html b/polly/www/changelog.html
index 8c789cd31f530..6ba9ba4723058 100644
--- a/polly/www/changelog.html
+++ b/polly/www/changelog.html
@@ -40,7 +40,7 @@

    3.6

  • Run-time alias checks
  • Computation of no-alias information for later LLVM optimizations (vectorizer, LICM, ...)
  • -
  • Support for multi-dimensional arrays of parameteric size (still tested)
  • +
  • Support for multi-dimensional arrays of parametric size (still tested)
  • New assumption tracking framework